summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.appveyor.yml42
-rw-r--r--.github/ISSUE_TEMPLATE.md5
-rw-r--r--.github/ISSUE_TEMPLATE/bug-report.md20
-rw-r--r--.github/ISSUE_TEMPLATE/config.yml6
-rw-r--r--.github/ISSUE_TEMPLATE/enhancement.md11
-rw-r--r--.github/SUPPORT.md36
-rw-r--r--.gitignore1
-rw-r--r--DEDUPE-TODO19
-rwxr-xr-xFIO-VERSION-GEN2
-rw-r--r--HOWTO260
-rw-r--r--Makefile189
-rw-r--r--README9
-rw-r--r--REPORTING-BUGS31
-rw-r--r--backend.c100
-rw-r--r--cconv.c18
-rwxr-xr-xci/appveyor-install.sh43
-rwxr-xr-xci/travis-build.sh8
-rwxr-xr-xci/travis-install-librpma.sh22
-rwxr-xr-xci/travis-install-pmdk.sh28
-rwxr-xr-xci/travis-install.sh15
-rw-r--r--compiler/compiler.h4
-rwxr-xr-xconfigure356
-rw-r--r--dedupe.c28
-rw-r--r--dedupe.h6
-rw-r--r--diskutil.c16
-rw-r--r--engines/cpu.c234
-rw-r--r--engines/dfs.c583
-rw-r--r--engines/exec.c394
-rw-r--r--engines/falloc.c4
-rw-r--r--engines/filecreate.c2
-rw-r--r--engines/filedelete.c115
-rw-r--r--engines/glusterfs.c3
-rw-r--r--engines/ime.c3
-rw-r--r--engines/io_uring.c16
-rw-r--r--engines/libcufile.c627
-rw-r--r--engines/libhdfs.c4
-rw-r--r--engines/libpmem.c64
-rw-r--r--engines/librpma_apm.c256
-rw-r--r--engines/librpma_fio.c1062
-rw-r--r--engines/librpma_fio.h275
-rw-r--r--engines/librpma_gpspm.c776
-rw-r--r--engines/librpma_gpspm_flush.pb-c.c214
-rw-r--r--engines/librpma_gpspm_flush.pb-c.h120
-rw-r--r--engines/librpma_gpspm_flush.proto15
-rw-r--r--engines/libzbc.c32
-rw-r--r--engines/net.c5
-rw-r--r--engines/nfs.c314
-rw-r--r--engines/posixaio.c41
-rw-r--r--engines/rados.c19
-rw-r--r--engines/rbd.c18
-rw-r--r--engines/sg.c22
-rw-r--r--engines/skeleton_external.c14
-rw-r--r--engines/windowsaio.c8
-rw-r--r--eta.c75
-rw-r--r--examples/1mbs_clients.pngbin0 -> 121336 bytes
-rw-r--r--examples/aio-read.pngbin0 -> 76819 bytes
-rw-r--r--examples/backwards-read.pngbin0 -> 31100 bytes
-rw-r--r--examples/basic-verify.pngbin0 -> 35952 bytes
-rw-r--r--examples/butterfly.pngbin0 -> 35393 bytes
-rw-r--r--examples/cpp_null.fio2
-rw-r--r--examples/cpp_null.pngbin0 -> 34346 bytes
-rw-r--r--examples/cpuio.fio14
-rw-r--r--examples/cpuio.pngbin0 -> 52593 bytes
-rw-r--r--examples/cross-stripe-verify.fio2
-rw-r--r--examples/cross-stripe-verify.pngbin0 -> 54366 bytes
-rw-r--r--examples/dev-dax.fio4
-rw-r--r--examples/dev-dax.pngbin0 -> 93539 bytes
-rw-r--r--examples/dfs.fio33
-rw-r--r--examples/dfs.pngbin0 -> 187461 bytes
-rw-r--r--examples/disk-zone-profile.pngbin0 -> 37313 bytes
-rw-r--r--examples/e4defrag.fio2
-rw-r--r--examples/e4defrag.pngbin0 -> 97107 bytes
-rw-r--r--examples/e4defrag2.fio7
-rw-r--r--examples/e4defrag2.pngbin0 -> 222226 bytes
-rw-r--r--examples/enospc-pressure.pngbin0 -> 150373 bytes
-rw-r--r--examples/exec.fio36
-rw-r--r--examples/exec.pngbin0 -> 101933 bytes
-rw-r--r--examples/exitwhat.fio2
-rw-r--r--examples/exitwhat.pngbin0 -> 111627 bytes
-rw-r--r--examples/falloc.fio4
-rw-r--r--examples/falloc.pngbin0 -> 129273 bytes
-rw-r--r--examples/filecreate-ioengine.pngbin0 -> 59636 bytes
-rw-r--r--examples/filedelete-ioengine.fio18
-rw-r--r--examples/filedelete-ioengine.pngbin0 -> 33042 bytes
-rw-r--r--examples/filestat-ioengine.pngbin0 -> 52330 bytes
-rw-r--r--examples/fio-rand-RW.fio2
-rw-r--r--examples/fio-rand-RW.pngbin0 -> 47406 bytes
-rw-r--r--examples/fio-rand-read.fio2
-rw-r--r--examples/fio-rand-read.pngbin0 -> 36614 bytes
-rw-r--r--examples/fio-rand-write.fio2
-rw-r--r--examples/fio-rand-write.pngbin0 -> 38608 bytes
-rw-r--r--examples/fio-seq-RW.fio2
-rw-r--r--examples/fio-seq-RW.pngbin0 -> 48279 bytes
-rw-r--r--examples/fio-seq-read.fio2
-rw-r--r--examples/fio-seq-read.pngbin0 -> 37851 bytes
-rw-r--r--examples/fio-seq-write.fio2
-rw-r--r--examples/fio-seq-write.pngbin0 -> 42756 bytes
-rw-r--r--examples/fixed-rate-submission.pngbin0 -> 41703 bytes
-rw-r--r--examples/flow.pngbin0 -> 63860 bytes
-rw-r--r--examples/fsx.fio1
-rw-r--r--examples/fsx.pngbin0 -> 37310 bytes
-rw-r--r--examples/ftruncate.pngbin0 -> 56594 bytes
-rw-r--r--examples/gfapi.pngbin0 -> 46875 bytes
-rw-r--r--examples/gpudirect-rdmaio-client.pngbin0 -> 50659 bytes
-rw-r--r--examples/gpudirect-rdmaio-server.pngbin0 -> 37805 bytes
-rw-r--r--examples/http-s3.pngbin0 -> 108929 bytes
-rw-r--r--examples/http-swift.pngbin0 -> 113698 bytes
-rw-r--r--examples/http-webdav.pngbin0 -> 86857 bytes
-rw-r--r--examples/ime.pngbin0 -> 193722 bytes
-rw-r--r--examples/iometer-file-access-server.pngbin0 -> 44797 bytes
-rw-r--r--examples/jesd219.fio2
-rw-r--r--examples/jesd219.pngbin0 -> 64846 bytes
-rw-r--r--examples/latency-profile.pngbin0 -> 44487 bytes
-rw-r--r--examples/libcufile-cufile.fio42
-rw-r--r--examples/libcufile-cufile.pngbin0 -> 160611 bytes
-rw-r--r--examples/libcufile-posix.fio41
-rw-r--r--examples/libcufile-posix.pngbin0 -> 164649 bytes
-rw-r--r--examples/libhdfs.pngbin0 -> 32812 bytes
-rw-r--r--examples/libiscsi.pngbin0 -> 31649 bytes
-rw-r--r--examples/libpmem.fio39
-rw-r--r--examples/libpmem.pngbin0 -> 119668 bytes
-rw-r--r--examples/librpma_apm-client.fio24
-rw-r--r--examples/librpma_apm-client.pngbin0 -> 53792 bytes
-rw-r--r--examples/librpma_apm-server.fio26
-rw-r--r--examples/librpma_apm-server.pngbin0 -> 42611 bytes
-rw-r--r--examples/librpma_gpspm-client.fio23
-rw-r--r--examples/librpma_gpspm-client.pngbin0 -> 56398 bytes
-rw-r--r--examples/librpma_gpspm-server.fio33
-rw-r--r--examples/librpma_gpspm-server.pngbin0 -> 53793 bytes
-rw-r--r--examples/libzbc-rand-write.fio2
-rw-r--r--examples/libzbc-rand-write.pngbin0 -> 48503 bytes
-rw-r--r--examples/libzbc-seq-read.pngbin0 -> 47229 bytes
-rw-r--r--examples/mtd.fio4
-rw-r--r--examples/mtd.pngbin0 -> 79866 bytes
-rw-r--r--examples/nbd.pngbin0 -> 88667 bytes
-rw-r--r--examples/netio.pngbin0 -> 50944 bytes
-rw-r--r--examples/netio_multicast.pngbin0 -> 74921 bytes
-rw-r--r--examples/nfs.fio22
-rw-r--r--examples/nfs.pngbin0 -> 84808 bytes
-rw-r--r--examples/null.fio1
-rw-r--r--examples/null.pngbin0 -> 30223 bytes
-rw-r--r--examples/numa.pngbin0 -> 66068 bytes
-rw-r--r--examples/pmemblk.fio6
-rw-r--r--examples/pmemblk.pngbin0 -> 107529 bytes
-rw-r--r--examples/poisson-rate-submission.pngbin0 -> 41057 bytes
-rw-r--r--examples/rados.pngbin0 -> 39665 bytes
-rw-r--r--examples/rand-zones.pngbin0 -> 38297 bytes
-rw-r--r--examples/rbd.pngbin0 -> 37191 bytes
-rw-r--r--examples/rdmaio-client.pngbin0 -> 44671 bytes
-rw-r--r--examples/rdmaio-server.pngbin0 -> 31860 bytes
-rw-r--r--examples/ssd-steadystate.pngbin0 -> 71772 bytes
-rw-r--r--examples/ssd-test.pngbin0 -> 99835 bytes
-rw-r--r--examples/steadystate.fio2
-rw-r--r--examples/steadystate.pngbin0 -> 64580 bytes
-rw-r--r--examples/surface-scan.fio2
-rw-r--r--examples/surface-scan.pngbin0 -> 72042 bytes
-rw-r--r--examples/test.pngbin0 -> 30141 bytes
-rw-r--r--examples/tiobench-example.pngbin0 -> 71939 bytes
-rw-r--r--examples/waitfor.fio2
-rw-r--r--examples/waitfor.pngbin0 -> 94577 bytes
-rw-r--r--examples/zbd-rand-write.fio2
-rw-r--r--examples/zbd-rand-write.pngbin0 -> 53018 bytes
-rw-r--r--examples/zbd-seq-read.pngbin0 -> 50185 bytes
-rw-r--r--examples/zipf.pngbin0 -> 33276 bytes
-rw-r--r--file.h1
-rw-r--r--filehash.c4
-rw-r--r--filesetup.c173
-rw-r--r--fio.1325
-rw-r--r--fio.h52
-rw-r--r--fio_time.h2
-rw-r--r--flow.c8
-rw-r--r--gclient.c4
-rw-r--r--gettime-thread.c2
-rw-r--r--gettime.c98
-rw-r--r--gettime.h1
-rw-r--r--gfio.c4
-rw-r--r--goptions.c2
-rw-r--r--helper_thread.c285
-rw-r--r--init.c226
-rw-r--r--io_u.c49
-rw-r--r--ioengines.c70
-rw-r--r--ioengines.h9
-rw-r--r--iolog.c20
-rw-r--r--iolog.h1
-rw-r--r--lib/gauss.c8
-rw-r--r--lib/gauss.h3
-rw-r--r--lib/num2str.c11
-rw-r--r--lib/prio_tree.c6
-rw-r--r--lib/rand.c10
-rw-r--r--lib/rand.h10
-rw-r--r--lib/zipf.c12
-rw-r--r--lib/zipf.h6
-rw-r--r--libfio.c5
-rw-r--r--log.c2
-rw-r--r--optgroup.c16
-rw-r--r--optgroup.h8
-rw-r--r--options.c261
-rw-r--r--options.h1
-rw-r--r--os/os-aix.h6
-rw-r--r--os/os-android.h25
-rw-r--r--os/os-dragonfly.h6
-rw-r--r--os/os-freebsd.h6
-rw-r--r--os/os-hpux.h7
-rw-r--r--os/os-linux.h40
-rw-r--r--os/os-mac.h10
-rw-r--r--os/os-netbsd.h6
-rw-r--r--os/os-openbsd.h6
-rw-r--r--os/os-solaris.h6
-rw-r--r--os/os-windows-xp.h3
-rw-r--r--os/os-windows.h7
-rw-r--r--os/os.h13
-rwxr-xr-xos/windows/WixUI_Minimal_NoEULA.wxs96
-rwxr-xr-xos/windows/WixUI_fio.wxl12
-rw-r--r--os/windows/cpu-affinity.c97
-rw-r--r--os/windows/dobuild.cmd15
-rwxr-xr-xos/windows/eula.rtfbin1075 -> 0 bytes
-rwxr-xr-xos/windows/install.wxs10
-rw-r--r--os/windows/posix.c92
-rw-r--r--os/windows/posix/include/arpa/inet.h6
-rw-r--r--os/windows/posix/include/poll.h14
-rw-r--r--oslib/blkzoned.h7
-rw-r--r--oslib/getopt_long.c4
-rw-r--r--oslib/libmtd.c4
-rw-r--r--oslib/libmtd_common.h1
-rw-r--r--oslib/linux-blkzoned.c116
-rw-r--r--parse.c37
-rw-r--r--parse.h13
-rw-r--r--server.c7
-rw-r--r--server.h2
-rw-r--r--stat.c365
-rw-r--r--stat.h5
-rw-r--r--steadystate.c3
-rw-r--r--steadystate.h2
-rw-r--r--t/dedupe.c21
-rw-r--r--t/fuzz/fuzz_parseini.c41
-rw-r--r--t/fuzz/onefile.c51
-rw-r--r--t/genzipf.c6
-rw-r--r--t/io_uring.c28
-rwxr-xr-xt/latency_percentiles.py4
-rw-r--r--t/memlock.c2
-rwxr-xr-xt/run-fio-tests.py8
-rw-r--r--t/zbd/functions70
-rwxr-xr-xt/zbd/run-tests-against-nullb354
-rwxr-xr-xt/zbd/run-tests-against-regular-nullb27
-rwxr-xr-xt/zbd/run-tests-against-zoned-nullb53
-rwxr-xr-xt/zbd/test-zbd-support398
-rw-r--r--td_error.c2
-rw-r--r--thread_options.h39
-rw-r--r--tools/fiograph/fiograph.conf105
-rwxr-xr-xtools/fiograph/fiograph.py305
-rwxr-xr-xtools/plot/fio2gnuplot2
-rw-r--r--unittests/lib/num2str.c2
-rw-r--r--verify.c1
-rw-r--r--zbd.c658
-rw-r--r--zbd.h10
-rw-r--r--zbd_types.h7
256 files changed, 10380 insertions, 1558 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 352caeee..42b79958 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -5,33 +5,49 @@ image:
environment:
CYG_MIRROR: http://cygwin.mirror.constant.com
- CYG_ROOT: C:\cygwin64
- MAKEFLAGS: -j 2
matrix:
- - platform: x64
- PACKAGE_ARCH: x86_64
+ - ARCHITECTURE: x64
+ CC: clang
+ CONFIGURE_OPTIONS: --enable-pdb
+ DISTRO: msys2
+# Skip 32 bit clang build
+# - ARCHITECTURE: x86
+# CC: clang
+# CONFIGURE_OPTIONS: --enable-pdb
+# DISTRO: msys2
+ - ARCHITECTURE: x64
CONFIGURE_OPTIONS:
- - platform: x86
- PACKAGE_ARCH: i686
- CONFIGURE_OPTIONS: --build-32bit-win --target-win-ver=xp
+ DISTRO: cygwin
+ - ARCHITECTURE: x86
+ CONFIGURE_OPTIONS: --build-32bit-win
+ DISTRO: cygwin
install:
- - '%CYG_ROOT%\setup-x86_64.exe --quiet-mode --no-shortcuts --only-site --site "%CYG_MIRROR%" --packages "mingw64-%PACKAGE_ARCH%-zlib,mingw64-%PACKAGE_ARCH%-CUnit" > NUL'
- - SET PATH=C:\Python38-x64;%CYG_ROOT%\bin;%PATH% # NB: Changed env variables persist to later sections
+ - if %DISTRO%==cygwin (
+ SET "PATH=C:\cygwin64\bin;C:\cygwin64;%PATH%"
+ )
+ - if %DISTRO%==msys2 if %ARCHITECTURE%==x86 (
+ SET "PATH=C:\msys64\mingw32\bin;C:\msys64\usr\bin;%PATH%"
+ )
+ - if %DISTRO%==msys2 if %ARCHITECTURE%==x64 (
+ SET "PATH=C:\msys64\mingw64\bin;C:\msys64\usr\bin;%PATH%"
+ )
+ - SET PATH=C:\Python38-x64;%PATH% # NB: Changed env variables persist to later sections
- SET PYTHONUNBUFFERED=TRUE
- - python.exe -m pip install scipy six
+ - bash.exe ci\appveyor-install.sh
build_script:
- - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure --disable-native --extra-cflags=\"-Werror\" ${CONFIGURE_OPTIONS} && make.exe'
+ - bash.exe configure --extra-cflags=-Werror --disable-native %CONFIGURE_OPTIONS%
+ - make.exe -j2
after_build:
- file.exe fio.exe
- make.exe test
- - 'cd os\windows && dobuild.cmd %PLATFORM% && cd ..'
+ - 'cd os\windows && dobuild.cmd %ARCHITECTURE% && cd ..'
- ps: Get-ChildItem .\os\windows\*.msi | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name -DeploymentName fio.msi }
test_script:
- - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && [ -f fio.exe ] && python.exe t/run-fio-tests.py --artifact-root test-artifacts --debug'
+ - python.exe t/run-fio-tests.py --artifact-root test-artifacts --debug
on_finish:
- 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && [ -d test-artifacts ] && 7z a -t7z test-artifacts.7z test-artifacts -xr!foo.0.0 -xr!latency.?.0 -xr!fio_jsonplus_clat2csv.test && appveyor PushArtifact test-artifacts.7z'
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 00000000..272968f8
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,5 @@
+**Please acknowledge you have done the following before creating a ticket**
+
+- [ ] I have read the GitHub issues section of [REPORTING-BUGS](../blob/master/REPORTING-BUGS).
+
+<!-- replace me with bug report / enhancement request -->
diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 00000000..10738165
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,20 @@
+---
+name: Report a bug
+about: For bugs that are reproducible with the latest fio releases
+
+---
+
+**Please acknowledge the following before creating a ticket**
+
+- [ ] I have read the GitHub issues section of [REPORTING-BUGS](../blob/master/REPORTING-BUGS).
+
+**Description of the bug:**
+<!--replaceme-->
+
+**Environment**: <!-- Name and version of operating system -->
+
+**fio version**: <!--replaceme-->
+
+**Reproduction steps**
+<!-- Please minimise the job file/command line options down to only those
+necessary to reproduce the issue (https://stackoverflow.com/help/mcve ) -->
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000..c7e3b372
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,6 @@
+blank_issues_enabled: true
+
+contact_links:
+- name: General questions (e.g. "How do I...", "Why is...") that are related to fio
+ url: http://vger.kernel.org/vger-lists.html#fio
+ about: Please send questions to the fio mailing list (plain-text emails ONLY)
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
new file mode 100644
index 00000000..1d4ba77d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -0,0 +1,11 @@
+---
+name: Feature enhancement request
+about: Suggest a new fio feature
+labels: enhancement
+
+---
+
+**Description of the new feature**
+<!-- Please be aware regular fio developers are busy with non-fio work. Because
+of this, most requests are only completed if someone from outside the project
+contributes the code. -->
diff --git a/.github/SUPPORT.md b/.github/SUPPORT.md
new file mode 100644
index 00000000..8d9df863
--- /dev/null
+++ b/.github/SUPPORT.md
@@ -0,0 +1,36 @@
+# Getting support for fio
+
+## General questions
+
+Please use the fio mailing list for asking general fio questions (e.g. "How do
+I do X?", "Why does Y happen?"). See the Mailing list section of the
+[README][readme] for details.
+
+## Reporting bugs
+
+As mentioned in [REPORTING-BUGS][reportingbugs], fio bugs and enhancements can
+be reported to the fio mailing list or fio's GitHub issues tracker.
+
+When reporting bugs please include ALL of the following:
+- Description of the issue
+- fio version number tested. If your fio isn't among the recent releases (see
+ the [fio releases page][releases]) please build a new one from source (see
+ the Source and Building sections of the [README][readme] for how to do this)
+ and reproduce the issue with the fresh build before filing an issue.
+- Reproduction steps and minimal job file/command line parameters.
+
+When requesting an enhancement only the description is needed.
+
+### GitHub issues specific information
+
+[Formatting terminal output with markdown][quotingcode] will help people who
+are reading your report. However, if the output is large (e.g. over 15 lines
+long) please consider including it as a text attachment. Avoid attaching
+pictures of screenshots as these are not searchable/selectable.
+
+<!-- Definitions -->
+
+[readme]: ../README
+[reportingbugs]: ../REPORTING-BUGS
+[releases]: ../../../releases
+[quotingcode]: https://docs.github.com/en/free-pro-team@latest/github/writing-on-github/basic-writing-and-formatting-syntax#quoting-code
diff --git a/.gitignore b/.gitignore
index 0aa4a361..6651f96e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,4 @@ doc/output
/tags
/TAGS
/t/zbd/test-zbd-support.log.*
+/t/fuzz/fuzz_parseini
diff --git a/DEDUPE-TODO b/DEDUPE-TODO
new file mode 100644
index 00000000..1f3ee9da
--- /dev/null
+++ b/DEDUPE-TODO
@@ -0,0 +1,19 @@
+- Mixed buffers of dedupe-able and compressible data.
+ Major usecase in performance benchmarking of storage subsystems.
+
+- Shifted dedup-able data.
+ Allow for dedup buffer generation to shift contents by random number
+ of sectors (fill the gaps with uncompressible data). Some storage
+ subsystems modernized the deduplication detection algorithms to look
+ for shifted data as well. For example, some databases push a timestamp
+ on the prefix of written blocks, which makes the underlying data
+ dedup-able in different alignment. FIO should be able to simulate such
+ workload.
+
+- Generation of similar data (but not exact).
+ A rising trend in enterprise storage systems.
+ Generation of "similar" data means random uncompressible buffers
+ that differ by few(configurable number of) bits from each other.
+ The storage subsystem usually identifies the similar buffers using
+ locality-sensitive hashing or other methods.
+
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index 5ee7735c..47af94e9 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
#!/bin/sh
GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.23
+DEF_VER=fio-3.27
LF='
'
diff --git a/HOWTO b/HOWTO
index 2d8c7a02..d4e620de 100644
--- a/HOWTO
+++ b/HOWTO
@@ -544,6 +544,9 @@ Parameter types
* *Ti* -- means tebi (Ti) or 1024**4
* *Pi* -- means pebi (Pi) or 1024**5
+ For Zone Block Device Mode:
+ * *z* -- means Zone
+
With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite
from those specified in the SI and IEC 80000-13 standards to provide
compatibility with old scripts. For example, 4k means 4096.
@@ -809,6 +812,8 @@ Target file/device
**$jobname**
The name of the worker thread or process.
+ **$clientuid**
+ IP of the fio process when using client/server mode.
**$jobnum**
The incremental number of the worker thread or process.
**$filenum**
@@ -1050,6 +1055,11 @@ Target file/device
number of open zones is defined as the number of zones to which write
commands are issued.
+.. option:: job_max_open_zones=int
+
+ Limit on the number of simultaneously opened zones per single
+ thread/process.
+
.. option:: zone_reset_threshold=float
A number between zero and one that indicates the ratio of logical
@@ -1144,11 +1154,31 @@ I/O type
behaves in a similar fashion, except it sends the same offset 8 number of
times before generating a new offset.
-.. option:: unified_rw_reporting=bool
+.. option:: unified_rw_reporting=str
Fio normally reports statistics on a per data direction basis, meaning that
- reads, writes, and trims are accounted and reported separately. If this
- option is set fio sums the results and report them as "mixed" instead.
+ reads, writes, and trims are accounted and reported separately. This option
+ determines whether fio reports the results normally, summed together, or as
+ both options.
+ Accepted values are:
+
+ **none**
+ Normal statistics reporting.
+
+ **mixed**
+ Statistics are summed per data direction and reported together.
+
+ **both**
+ Statistics are reported normally, followed by the mixed statistics.
+
+ **0**
+ Backward-compatible alias for **none**.
+
+ **1**
+ Backward-compatible alias for **mixed**.
+
+ **2**
+ Alias for **both**.
.. option:: randrepeat=bool
@@ -1255,13 +1285,14 @@ I/O type
.. option:: offset=int
Start I/O at the provided offset in the file, given as either a fixed size in
- bytes or a percentage. If a percentage is given, the generated offset will be
+ bytes, zones or a percentage. If a percentage is given, the generated offset will be
aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if
provided. Data before the given offset will not be touched. This
effectively caps the file size at `real_size - offset`. Can be combined with
:option:`size` to constrain the start and end range of the I/O workload.
A percentage can be specified by a number between 1 and 100 followed by '%',
- for example, ``offset=20%`` to specify 20%.
+ for example, ``offset=20%`` to specify 20%. In ZBD mode, value can be set as
+ number of zones using 'z'.
.. option:: offset_align=int
@@ -1278,7 +1309,8 @@ I/O type
intended to operate on a file in parallel disjoint segments, with even
spacing between the starting points. Percentages can be used for this option.
If a percentage is given, the generated offset will be aligned to the minimum
- ``blocksize`` or to the value of ``offset_align`` if provided.
+ ``blocksize`` or to the value of ``offset_align`` if provided. In ZBD mode, value can
+ also be set as number of zones using 'z'.
.. option:: number_ios=int
@@ -1361,7 +1393,7 @@ I/O type
limit reads or writes to a certain rate. If that is the case, then the
distribution may be skewed. Default: 50.
-.. option:: random_distribution=str:float[,str:float][,str:float]
+.. option:: random_distribution=str:float[:float][,str:float][,str:float]
By default, fio will use a completely uniform random distribution when asked
to perform random I/O. Sometimes it is useful to skew the distribution in
@@ -1396,6 +1428,14 @@ I/O type
map. For the **normal** distribution, a normal (Gaussian) deviation is
supplied as a value between 0 and 100.
+ The second, optional float is allowed for **pareto**, **zipf** and **normal** distributions.
+ It allows setting the base of the distribution in a non-default place, giving more control
+ over the most probable outcome. This value is in range [0-1] which maps linearly to
+ range of possible random values.
+ Defaults are: random for **pareto** and **zipf**, and 0.5 for **normal**.
+ If you wanted to use **zipf** with a `theta` of 1.2 centered on 1/4 of allowed value range,
+ you would use ``random_distribution=zipf:1.2:0.25``.
+
For a **zoned** distribution, fio supports specifying percentages of I/O
access that should fall within what range of the file or device. For
example, given a criteria of:
@@ -1670,6 +1710,36 @@ Buffers and memory
this option will also enable :option:`refill_buffers` to prevent every buffer
being identical.
+.. option:: dedupe_mode=str
+
+ If ``dedupe_percentage=<int>`` is given, then this option controls how fio
+ generates the dedupe buffers.
+
+ **repeat**
+ Generate dedupe buffers by repeating previous writes
+ **working_set**
+ Generate dedupe buffers from working set
+
+ ``repeat`` is the default option for fio. Dedupe buffers are generated
+ by repeating previous unique write.
+
+ ``working_set`` is a more realistic workload.
+ With ``working_set``, ``dedupe_working_set_percentage=<int>`` should be provided.
+ Given that, fio will use the initial unique write buffers as its working set.
+ Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+ Note that by using ``working_set`` the dedupe percentage will converge
+ to the desired over time while ``repeat`` maintains the desired percentage
+ throughout the job.
+
+.. option:: dedupe_working_set_percentage=int
+
+ If ``dedupe_mode=<str>`` is set to ``working_set``, then this controls
+ the percentage of size of the file or device used as the buffers
+ fio will choose to generate the dedupe buffers from
+
+ Note that size needs to be explicitly provided and only 1 file per
+ job is supported
+
.. option:: invalidate=bool
Invalidate the buffer/page cache parts of the files to be used prior to
@@ -1677,10 +1747,28 @@ Buffers and memory
This will be ignored if :option:`pre_read` is also specified for the
same job.
-.. option:: sync=bool
+.. option:: sync=str
+
+ Whether, and what type, of synchronous I/O to use for writes. The allowed
+ values are:
+
+ **none**
+ Do not use synchronous IO, the default.
+
+ **0**
+ Same as **none**.
+
+ **sync**
+ Use synchronous file IO. For the majority of I/O engines,
+ this means using O_SYNC.
+
+ **1**
+ Same as **sync**.
+
+ **dsync**
+ Use synchronous data IO. For the majority of I/O engines,
+ this means using O_DSYNC.
- Use synchronous I/O for buffered writes. For the majority of I/O engines,
- this means using O_SYNC. Default: false.
.. option:: iomem=str, mem=str
@@ -1770,7 +1858,8 @@ I/O size
If this option is not specified, fio will use the full size of the given
files or devices. If the files do not exist, size must be given. It is also
possible to give size as a percentage between 1 and 100. If ``size=20%`` is
- given, fio will use 20% of the full size of the given files or devices.
+ given, fio will use 20% of the full size of the given files or devices.
+ In ZBD mode, value can also be set as number of zones using 'z'.
Can be combined with :option:`offset` to constrain the start and end range
that I/O will be done within.
@@ -1804,7 +1893,8 @@ I/O size
.. option:: fill_device=bool, fill_fs=bool
Sets size to something really large and waits for ENOSPC (no space left on
- device) as the terminating condition. Only makes sense with sequential
+ device) or EDQUOT (disk quota exceeded)
+ as the terminating condition. Only makes sense with sequential
write. For a read workload, the mount point will be filled first then I/O
started on the result. This option doesn't make sense if operating on a raw
device node, since the size of that is already known by the file system.
@@ -1894,12 +1984,14 @@ I/O engine
**cpuio**
Doesn't transfer any data, but burns CPU cycles according to the
- :option:`cpuload` and :option:`cpuchunks` options. Setting
- :option:`cpuload`\=85 will cause that job to do nothing but burn 85%
+ :option:`cpuload`, :option:`cpuchunks` and :option:`cpumode` options.
+ Setting :option:`cpuload`\=85 will cause that job to do nothing but burn 85%
of the CPU. In case of SMP machines, use :option:`numjobs`\=<nr_of_cpu>
to get desired CPU usage, as the cpuload only loads a
single CPU at the desired rate. A job never finishes unless there is
at least one non-cpuio job.
+ Setting :option:`cpumode`\=qsort replace the default noop instructions loop
+ by a qsort algorithm to consume more energy.
**rdma**
The RDMA I/O engine supports both RDMA memory semantics
@@ -2005,6 +2097,11 @@ I/O engine
and 'nrfiles', so that files will be created.
This engine is to measure file lookup and meta data access.
+ **filedelete**
+ Simply delete the files by unlink() and do no I/O to them. You need to set 'filesize'
+ and 'nrfiles', so that the files will be created.
+ This engine is to measure file delete.
+
**libpmem**
Read and write using mmap I/O to a file on a filesystem
mounted with DAX on a persistent memory device through the PMDK
@@ -2030,6 +2127,26 @@ I/O engine
**nbd**
Read and write a Network Block Device (NBD).
+ **libcufile**
+ I/O engine supporting libcufile synchronous access to nvidia-fs and a
+ GPUDirect Storage-supported filesystem. This engine performs
+ I/O without transferring buffers between user-space and the kernel,
+ unless :option:`verify` is set or :option:`cuda_io` is `posix`.
+ :option:`iomem` must not be `cudamalloc`. This ioengine defines
+ engine specific options.
+ **dfs**
+ I/O engine supporting asynchronous read and write operations to the
+ DAOS File System (DFS) via libdfs.
+
+ **nfs**
+ I/O engine supporting asynchronous read and write operations to
+ NFS filesystems from userspace via libnfs. This is useful for
+ achieving higher concurrency and thus throughput than is possible
+ via kernel NFS.
+
+ **exec**
+ Execute 3rd party tools. Could be used to perform monitoring during jobs runtime.
+
I/O engine specific parameters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2151,7 +2268,7 @@ with the caveat that when used on the command line, they must come after the
this will be the starting port number since fio will use a range of
ports.
- [rdma]
+ [rdma], [librpma_*]
The port to use for RDMA-CM communication. This should be the same value
on the client and the server side.
@@ -2162,6 +2279,20 @@ with the caveat that when used on the command line, they must come after the
is a TCP listener or UDP reader, the hostname is not used and must be omitted
unless it is a valid UDP multicast address.
+.. option:: serverip=str : [librpma_*]
+
+ The IP address to be used for RDMA-CM based I/O.
+
+.. option:: direct_write_to_pmem=bool : [librpma_*]
+
+ Set to 1 only when Direct Write to PMem from the remote host is possible.
+ Otherwise, set to 0.
+
+.. option:: busy_wait_polling=bool : [librpma_*_server]
+
+ Set to 0 to wait for completion instead of busy-wait polling completion.
+ Default: 1.
+
.. option:: interface=str : [netsplice] [net]
The IP address of the network interface used to send or receive UDP
@@ -2258,6 +2389,12 @@ with the caveat that when used on the command line, they must come after the
Poll store instead of waiting for completion. Usually this provides better
throughput at cost of higher(up to 100%) CPU utilization.
+.. option:: touch_objects=bool : [rados]
+
+ During initialization, touch (create if do not exist) all objects (files).
+ Touching all objects affects ceph caches and likely impacts test results.
+ Enabled by default.
+
.. option:: skip_bad=bool : [mtd]
Skip operations against known bad blocks.
@@ -2323,6 +2460,18 @@ with the caveat that when used on the command line, they must come after the
transferred to the device. The writefua option is ignored with this
selection.
+.. option:: hipri : [sg]
+
+ If this option is set, fio will attempt to use polled IO completions.
+ This will have a similar effect as (io_uring)hipri. Only SCSI READ and
+ WRITE commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor
+ VERIFY). Older versions of the Linux sg driver that do not support
+ hipri will simply ignore this flag and do normal IO. The Linux SCSI
+ Low Level Driver (LLD) that "owns" the device also needs to support
+ hipri (also known as iopoll and mq_poll). The MegaRAID driver is an
+ example of a SCSI LLD. Default: clear (0) which does normal
+ (interrupted based) IO.
+
.. option:: http_host=str : [http]
Hostname to connect to. For S3, this could be the bucket hostname.
@@ -2380,6 +2529,73 @@ with the caveat that when used on the command line, they must come after the
nbd+unix:///?socket=/tmp/socket
nbds://tlshost/exportname
+.. option:: gpu_dev_ids=str : [libcufile]
+
+ Specify the GPU IDs to use with CUDA. This is a colon-separated list of
+ int. GPUs are assigned to workers roundrobin. Default is 0.
+
+.. option:: cuda_io=str : [libcufile]
+
+ Specify the type of I/O to use with CUDA. Default is **cufile**.
+
+ **cufile**
+ Use libcufile and nvidia-fs. This option performs I/O directly
+ between a GPUDirect Storage filesystem and GPU buffers,
+ avoiding use of a bounce buffer. If :option:`verify` is set,
+ cudaMemcpy is used to copy verification data between RAM and GPU.
+ Verification data is copied from RAM to GPU before a write
+ and from GPU to RAM after a read. :option:`direct` must be 1.
+ **posix**
+ Use POSIX to perform I/O with a RAM buffer, and use cudaMemcpy
+ to transfer data between RAM and the GPUs. Data is copied from
+ GPU to RAM before a write and copied from RAM to GPU after a
+ read. :option:`verify` does not affect use of cudaMemcpy.
+
+.. option:: pool=str : [dfs]
+
+ Specify the UUID of the DAOS pool to connect to.
+
+.. option:: cont=str : [dfs]
+
+ Specify the UUID of the DAOS container to open.
+
+.. option:: chunk_size=int : [dfs]
+
+ Specify a different chunk size (in bytes) for the dfs file.
+ Use DAOS container's chunk size by default.
+
+.. option:: object_class=str : [dfs]
+
+ Specify a different object class for the dfs file.
+ Use DAOS container's object class by default.
+
+.. option:: nfs_url=str : [nfs]
+
+ URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
+ Refer to the libnfs README for more details.
+
+.. option:: program=str : [exec]
+
+ Specify the program to execute.
+
+.. option:: arguments=str : [exec]
+
+ Specify arguments to pass to program.
+ Some special variables can be expanded to pass fio's job details to the program.
+
+ **%r**
+ Replaced by the duration of the job in seconds.
+ **%n**
+ Replaced by the name of the job.
+
+.. option:: grace_time=int : [exec]
+
+ Specify the time between the SIGTERM and SIGKILL signals. Default is 1 second.
+
+.. option:: std_redirect=bool : [exec]
+
+ If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
+
I/O depth
~~~~~~~~~
@@ -2504,6 +2720,13 @@ I/O rate
before we have to complete it and do our :option:`thinktime`. In other words, this
setting effectively caps the queue depth if the latter is larger.
+.. option:: thinktime_blocks_type=str
+
+ Only valid if :option:`thinktime` is set - control how :option:`thinktime_blocks`
+ triggers. The default is `complete`, which triggers thinktime when fio completes
+ :option:`thinktime_blocks` blocks. If this is set to `issue`, then the trigger happens
+ at the issue side.
+
.. option:: rate=int[,int][,int]
Cap the bandwidth used by this job. The number is in bytes/sec, the normal
@@ -2584,11 +2807,12 @@ I/O latency
true, fio will continue running and try to meet :option:`latency_target`
by adjusting queue depth.
-.. option:: max_latency=time
+.. option:: max_latency=time[,time][,time]
If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
maximum latency. When the unit is omitted, the value is interpreted in
- microseconds.
+ microseconds. Comma-separated values may be specified for reads, writes,
+ and trims as described in :option:`blocksize`.
.. option:: rate_cycle=int
@@ -3915,7 +4139,7 @@ will be a disk utilization section.
Below is a single line containing short names for each of the fields in the
minimal output v3, separated by semicolons::
- terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+ terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth_kb;read_iops;read_runtime_ms;read_slat_min_us;read_slat_max_us;read_slat_mean_us;read_slat_dev_us;read_clat_min_us;read_clat_max_us;read_clat_mean_us;read_clat_dev_us;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min_us;read_lat_max_us;read_lat_mean_us;read_lat_dev_us;read_bw_min_kb;read_bw_max_kb;read_bw_agg_pct;read_bw_mean_kb;read_bw_dev_kb;write_kb;write_bandwidth_kb;write_iops;write_runtime_ms;write_slat_min_us;write_slat_max_us;write_slat_mean_us;write_slat_dev_us;write_clat_min_us;write_clat_max_us;write_clat_mean_us;write_clat_dev_us;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min_us;write_lat_max_us;write_lat_mean_us;write_lat_dev_us;write_bw_min_kb;write_bw_max_kb;write_bw_agg_pct;write_bw_mean_kb;write_bw_dev_kb;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
In client/server mode terse output differs from what appears when jobs are run
locally. Disk utilization data is omitted from the standard terse output and
diff --git a/Makefile b/Makefile
index b00daca2..5198f70e 100644
--- a/Makefile
+++ b/Makefile
@@ -22,16 +22,27 @@ endif
DEBUGFLAGS = -DFIO_INC_DEBUG
CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS)
OPTFLAGS= -g -ffast-math
-CFLAGS := -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR) $(CFLAGS)
+FIO_CFLAGS= -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR)
LIBS += -lm $(EXTLIBS)
PROGS = fio
SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/hist/fio-histo-log-pctiles.py tools/fio_jsonplus_clat2csv)
ifndef CONFIG_FIO_NO_OPT
- CFLAGS := -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 $(CFLAGS)
+ FIO_CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2
endif
ifdef CONFIG_BUILD_NATIVE
- CFLAGS := -march=native $(CFLAGS)
+ FIO_CFLAGS += -march=native
+endif
+
+ifdef CONFIG_PDB
+ LINK_PDBFILE ?= -Wl,-pdb,$(dir $@)/$(basename $(@F)).pdb
+ FIO_CFLAGS += -gcodeview
+ LDFLAGS += -fuse-ld=lld $(LINK_PDBFILE)
+endif
+
+# If clang, do not use builtin stpcpy as it breaks the build
+ifeq ($(CC),clang)
+ FIO_CFLAGS += -fno-builtin-stpcpy
endif
ifdef CONFIG_GFIO
@@ -45,25 +56,26 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
pshared.c options.c \
smalloc.c filehash.c profile.c debug.c engines/cpu.c \
engines/mmap.c engines/sync.c engines/null.c engines/net.c \
- engines/ftruncate.c engines/filecreate.c engines/filestat.c \
+ engines/ftruncate.c engines/filecreate.c engines/filestat.c engines/filedelete.c \
+ engines/exec.c \
server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
gettime-thread.c helpers.c json.c idletime.c td_error.c \
profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
workqueue.c rate-submit.c optgroup.c helper_thread.c \
- steadystate.c zone-dist.c zbd.c
+ steadystate.c zone-dist.c zbd.c dedupe.c
ifdef CONFIG_LIBHDFS
HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
- HDFSLIB= -Wl,-rpath $(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server $(FIO_LIBHDFS_LIB)/libhdfs.a -ljvm
- CFLAGS := $(HDFSFLAGS) $(CFLAGS)
+ HDFSLIB= -Wl,-rpath $(JAVA_HOME)/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/lib/$(FIO_HDFS_CPU)/server $(FIO_LIBHDFS_LIB)/libhdfs.a -ljvm
+ FIO_CFLAGS += $(HDFSFLAGS)
SOURCE += engines/libhdfs.c
endif
ifdef CONFIG_LIBISCSI
- iscsi_SRCS = engines/libiscsi.c
- iscsi_LIBS = $(LIBISCSI_LIBS)
- iscsi_CFLAGS = $(LIBISCSI_CFLAGS)
- ENGINES += iscsi
+ libiscsi_SRCS = engines/libiscsi.c
+ libiscsi_LIBS = $(LIBISCSI_LIBS)
+ libiscsi_CFLAGS = $(LIBISCSI_CFLAGS)
+ ENGINES += libiscsi
endif
ifdef CONFIG_LIBNBD
@@ -73,27 +85,42 @@ ifdef CONFIG_LIBNBD
ENGINES += nbd
endif
-ifdef CONFIG_64BIT
- CFLAGS := -DBITS_PER_LONG=64 $(CFLAGS)
+ifdef CONFIG_LIBNFS
+ CFLAGS += $(LIBNFS_CFLAGS)
+ LIBS += $(LIBNFS_LIBS)
+ SOURCE += engines/nfs.c
endif
-ifdef CONFIG_32BIT
- CFLAGS := -DBITS_PER_LONG=32 $(CFLAGS)
+
+ifdef CONFIG_64BIT
+ CPPFLAGS += -DBITS_PER_LONG=64
+else ifdef CONFIG_32BIT
+ CPPFLAGS += -DBITS_PER_LONG=32
endif
ifdef CONFIG_LIBAIO
- aio_SRCS = engines/libaio.c
- aio_LIBS = -laio
- ifdef CONFIG_LIBAIO_URING
- aio_LIBS = -luring
- else
- aio_LIBS = -laio
- endif
- ENGINES += aio
+ libaio_SRCS = engines/libaio.c
+ libaio_LIBS = -laio
+ ENGINES += libaio
endif
ifdef CONFIG_RDMA
rdma_SRCS = engines/rdma.c
rdma_LIBS = -libverbs -lrdmacm
ENGINES += rdma
endif
+ifdef CONFIG_LIBRPMA_APM
+ librpma_apm_SRCS = engines/librpma_apm.c
+ librpma_fio_SRCS = engines/librpma_fio.c
+ librpma_apm_LIBS = -lrpma -lpmem
+ ENGINES += librpma_apm
+endif
+ifdef CONFIG_LIBRPMA_GPSPM
+ librpma_gpspm_SRCS = engines/librpma_gpspm.c engines/librpma_gpspm_flush.pb-c.c
+ librpma_fio_SRCS = engines/librpma_fio.c
+ librpma_gpspm_LIBS = -lrpma -lpmem -lprotobuf-c
+ ENGINES += librpma_gpspm
+endif
+ifdef librpma_fio_SRCS
+ SOURCE += $(librpma_fio_SRCS)
+endif
ifdef CONFIG_POSIXAIO
SOURCE += engines/posixaio.c
endif
@@ -103,6 +130,9 @@ endif
ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT
SOURCE += engines/e4defrag.c
endif
+ifdef CONFIG_LIBCUFILE
+ SOURCE += engines/libcufile.c
+endif
ifdef CONFIG_LINUX_SPLICE
SOURCE += engines/splice.c
endif
@@ -127,6 +157,11 @@ ifdef CONFIG_HTTP
http_LIBS = -lcurl -lssl -lcrypto
ENGINES += http
endif
+ifdef CONFIG_DFS
+ dfs_SRCS = engines/dfs.c
+ dfs_LIBS = -luuid -ldaos -ldfs
+ ENGINES += dfs
+endif
SOURCE += oslib/asprintf.c
ifndef CONFIG_STRSEP
SOURCE += oslib/strsep.c
@@ -155,7 +190,7 @@ ifdef CONFIG_GFAPI
SOURCE += engines/glusterfs_async.c
LIBS += -lgfapi -lglusterfs
ifdef CONFIG_GF_FADVISE
- CFLAGS := "-DGFAPI_USE_FADVISE" $(CFLAGS)
+ FIO_CFLAGS += "-DGFAPI_USE_FADVISE"
endif
endif
ifdef CONFIG_MTD
@@ -174,17 +209,17 @@ ifdef CONFIG_LINUX_DEVDAX
ENGINES += dev-dax
endif
ifdef CONFIG_LIBPMEM
- pmem_SRCS = engines/libpmem.c
- pmem_LIBS = -lpmem
- ENGINES += pmem
+ libpmem_SRCS = engines/libpmem.c
+ libpmem_LIBS = -lpmem
+ ENGINES += libpmem
endif
ifdef CONFIG_IME
SOURCE += engines/ime.c
endif
ifdef CONFIG_LIBZBC
- zbc_SRCS = engines/libzbc.c
- zbc_LIBS = -lzbc
- ENGINES += zbc
+ libzbc_SRCS = engines/libzbc.c
+ libzbc_LIBS = -lzbc
+ ENGINES += libzbc
endif
ifeq ($(CONFIG_TARGET_OS), Linux)
@@ -234,7 +269,7 @@ ifeq ($(CONFIG_TARGET_OS), AIX)
endif
ifeq ($(CONFIG_TARGET_OS), HP-UX)
LIBS += -lpthread -ldl -lrt
- CFLAGS := -D_LARGEFILE64_SOURCE -D_XOPEN_SOURCE_EXTENDED $(CFLAGS)
+ FIO_CFLAGS += -D_LARGEFILE64_SOURCE -D_XOPEN_SOURCE_EXTENDED
endif
ifeq ($(CONFIG_TARGET_OS), Darwin)
LIBS += -lpthread -ldl
@@ -243,7 +278,7 @@ ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
SOURCE += os/windows/cpu-affinity.c os/windows/posix.c
WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o lib/hweight.o
LIBS += -lpthread -lpsapi -lws2_32 -lssp
- CFLAGS := -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format $(CFLAGS)
+ FIO_CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format
endif
ifdef CONFIG_DYNAMIC_ENGINES
@@ -251,19 +286,24 @@ ifdef CONFIG_DYNAMIC_ENGINES
define engine_template =
$(1)_OBJS := $$($(1)_SRCS:.c=.o)
$$($(1)_OBJS): CFLAGS := -fPIC $$($(1)_CFLAGS) $(CFLAGS)
-engines/lib$(1).so: $$($(1)_OBJS)
- $$(QUIET_LINK)$(CC) -shared -rdynamic -fPIC -Wl,-soname,lib$(1).so.1 $$($(1)_LIBS) -o $$@ $$<
-ENGS_OBJS += engines/lib$(1).so
-all install: $(ENGS_OBJS)
+engines/fio-$(1).so: $$($(1)_OBJS)
+ $$(QUIET_LINK)$(CC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
+ENGS_OBJS += engines/fio-$(1).so
endef
else # !CONFIG_DYNAMIC_ENGINES
define engine_template =
SOURCE += $$($(1)_SRCS)
LIBS += $$($(1)_LIBS)
-CFLAGS := $$($(1)_CFLAGS) $(CFLAGS)
+override CFLAGS += $$($(1)_CFLAGS)
endef
endif
+FIO-VERSION-FILE: FORCE
+ @$(SHELL) $(SRCDIR)/FIO-VERSION-GEN
+-include FIO-VERSION-FILE
+
+override CFLAGS := -DFIO_VERSION='"$(FIO_VERSION)"' $(FIO_CFLAGS) $(CFLAGS)
+
$(foreach eng,$(ENGINES),$(eval $(call engine_template,$(eng))))
OBJS := $(SOURCE:.c=.o)
@@ -338,6 +378,23 @@ T_MEMLOCK_PROGS = t/memlock
T_TT_OBJS = t/time-test.o
T_TT_PROGS = t/time-test
+T_FUZZ_OBJS = t/fuzz/fuzz_parseini.o
+T_FUZZ_OBJS += $(OBJS)
+ifdef CONFIG_ARITHMETIC
+T_FUZZ_OBJS += lex.yy.o y.tab.o
+endif
+# in case there is no fuzz driver defined by environment variable LIB_FUZZING_ENGINE, use a simple one
+# For instance, with compiler clang, address sanitizer and libFuzzer as a fuzzing engine, you should define
+# export CFLAGS="-fsanitize=address,fuzzer-no-link"
+# export LIB_FUZZING_ENGINE="-fsanitize=address"
+# export CC=clang
+# before running configure && make
+# You can adapt this with different compilers, sanitizers, and fuzzing engines
+ifndef LIB_FUZZING_ENGINE
+T_FUZZ_OBJS += t/fuzz/onefile.o
+endif
+T_FUZZ_PROGS = t/fuzz/fuzz_parseini
+
T_OBJS = $(T_SMALLOC_OBJS)
T_OBJS += $(T_IEEE_OBJS)
T_OBJS += $(T_ZIPF_OBJS)
@@ -351,6 +408,7 @@ T_OBJS += $(T_PIPE_ASYNC_OBJS)
T_OBJS += $(T_MEMLOCK_OBJS)
T_OBJS += $(T_TT_OBJS)
T_OBJS += $(T_IOU_RING_OBJS)
+T_OBJS += $(T_FUZZ_OBJS)
ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
T_DEDUPE_OBJS += $(WINDOWS_OBJS)
@@ -374,6 +432,7 @@ endif
ifneq (,$(findstring Linux,$(CONFIG_TARGET_OS)))
T_TEST_PROGS += $(T_IOU_RING_PROGS)
endif
+T_TEST_PROGS += $(T_FUZZ_PROGS)
PROGS += $(T_PROGS)
@@ -427,17 +486,11 @@ mandir = $(prefix)/man
sharedir = $(prefix)/share/fio
endif
-all: $(PROGS) $(T_TEST_PROGS) $(UT_PROGS) $(SCRIPTS) FORCE
+all: $(PROGS) $(T_TEST_PROGS) $(UT_PROGS) $(SCRIPTS) $(ENGS_OBJS) FORCE
.PHONY: all install clean test
.PHONY: FORCE cscope
-FIO-VERSION-FILE: FORCE
- @$(SHELL) $(SRCDIR)/FIO-VERSION-GEN
--include FIO-VERSION-FILE
-
-override CFLAGS := -DFIO_VERSION='"$(FIO_VERSION)"' $(CFLAGS)
-
%.o : %.c
@mkdir -p $(dir $@)
$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
@@ -478,7 +531,7 @@ lexer.h: lex.yy.c
exp/test-expression-parser.o: exp/test-expression-parser.c
$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
exp/test-expression-parser: exp/test-expression-parser.o
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) $< y.tab.o lex.yy.o -o $@ $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) $< y.tab.o lex.yy.o -o $@ $(LIBS)
parse.o: lex.yy.o y.tab.o
endif
@@ -514,55 +567,62 @@ printing.o: printing.c printing.h
t/io_uring.o: os/linux/io_uring.h
t/io_uring: $(T_IOU_RING_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IOU_RING_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_IOU_RING_OBJS) $(LIBS)
t/read-to-pipe-async: $(T_PIPE_ASYNC_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_PIPE_ASYNC_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_PIPE_ASYNC_OBJS) $(LIBS)
t/memlock: $(T_MEMLOCK_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_MEMLOCK_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_MEMLOCK_OBJS) $(LIBS)
t/stest: $(T_SMALLOC_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_SMALLOC_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_SMALLOC_OBJS) $(LIBS)
t/ieee754: $(T_IEEE_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IEEE_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_IEEE_OBJS) $(LIBS)
fio: $(FIO_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(FIO_OBJS) $(LIBS) $(HDFSLIB)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(FIO_OBJS) $(LIBS) $(HDFSLIB)
+
+t/fuzz/fuzz_parseini: $(T_FUZZ_OBJS)
+ifndef LIB_FUZZING_ENGINE
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_FUZZ_OBJS) $(LIBS) $(HDFSLIB)
+else
+ $(QUIET_LINK)$(CXX) $(LDFLAGS) -o $@ $(T_FUZZ_OBJS) $(LIB_FUZZING_ENGINE) $(LIBS) $(HDFSLIB)
+endif
gfio: $(GFIO_OBJS)
$(QUIET_LINK)$(CC) $(filter-out -static, $(LDFLAGS)) -o gfio $(GFIO_OBJS) $(LIBS) $(GFIO_LIBS) $(GTK_LDFLAGS) $(HDFSLIB)
t/fio-genzipf: $(T_ZIPF_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS)
t/axmap: $(T_AXMAP_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_AXMAP_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_AXMAP_OBJS) $(LIBS)
t/lfsr-test: $(T_LFSR_TEST_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS)
t/gen-rand: $(T_GEN_RAND_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_GEN_RAND_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_GEN_RAND_OBJS) $(LIBS)
ifeq ($(CONFIG_TARGET_OS), Linux)
t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
endif
t/fio-dedupe: $(T_DEDUPE_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
t/fio-verify-state: $(T_VS_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
t/time-test: $(T_TT_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_TT_OBJS) $(LIBS)
ifdef CONFIG_HAVE_CUNIT
unittests/unittest: $(UT_OBJS) $(UT_TARGET_OBJS)
- $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(UT_OBJS) $(UT_TARGET_OBJS) -lcunit $(LIBS)
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(UT_OBJS) $(UT_TARGET_OBJS) -lcunit $(LIBS)
endif
clean: FORCE
@@ -591,19 +651,20 @@ test: fio
fulltest:
sudo modprobe null_blk && \
if [ ! -e /usr/include/libzbc/zbc.h ]; then \
- git clone https://github.com/hgst/libzbc && \
+ git clone https://github.com/westerndigitalcorporation/libzbc && \
(cd libzbc && \
./autogen.sh && \
./configure --prefix=/usr && \
make -j && \
sudo make install) \
fi && \
- sudo t/zbd/run-tests-against-regular-nullb && \
+ sudo t/zbd/run-tests-against-nullb -s 1 && \
if [ -e /sys/module/null_blk/parameters/zoned ]; then \
- sudo t/zbd/run-tests-against-zoned-nullb; \
+ sudo t/zbd/run-tests-against-nullb -s 2; \
+ sudo t/zbd/run-tests-against-nullb -s 4; \
fi
-install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE
+install: $(PROGS) $(SCRIPTS) $(ENGS_OBJS) tools/plot/fio2gnuplot.1 FORCE
$(INSTALL) -m 755 -d $(DESTDIR)$(bindir)
$(INSTALL) $(PROGS) $(SCRIPTS) $(DESTDIR)$(bindir)
ifdef CONFIG_DYNAMIC_ENGINES
diff --git a/README b/README
index 0f943bcc..2fecf0e0 100644
--- a/README
+++ b/README
@@ -164,8 +164,9 @@ configure.
Windows
~~~~~~~
-On Windows, Cygwin (https://www.cygwin.com/) is required in order to build
-fio. To create an MSI installer package install WiX from
+The minimum versions of Windows for building/running fio are Windows 7/Windows
+Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in
+order to build fio. To create an MSI installer package install WiX from
https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
directory.
@@ -181,9 +182,7 @@ How to compile fio on 64-bit Windows:
To build fio for 32-bit Windows, ensure the -i686 versions of the previously
mentioned -x86_64 packages are installed and run ``./configure
---build-32bit-win`` before ``make``. To build an fio that supports versions of
-Windows below Windows 7/Windows Server 2008 R2 also add ``--target-win-ver=xp``
-to the end of the configure line that you run before doing ``make``.
+--build-32bit-win`` before ``make``.
It's recommended that once built or installed, fio be run in a Command Prompt or
other 'native' console such as console2, since there are known to be display and
diff --git a/REPORTING-BUGS b/REPORTING-BUGS
index 327b6caa..c0204d7e 100644
--- a/REPORTING-BUGS
+++ b/REPORTING-BUGS
@@ -1,16 +1,20 @@
Reporting a bug
---------------
-If you notice anything that seems like a fio bug, please do send email
-to the list (fio@vger.kernel.org, see README) about it. If you are not
-running the newest release of fio, upgrading first is recommended.
+...via the mailing list
+=======================
+
+If you notice anything that seems like a fio bug or want to ask fio related
+questions, please send a plain-text only email to the list
+(fio@vger.kernel.org, see README) about it. If you are not running the newest
+release of fio please upgrade first.
When reporting a bug, you'll need to include:
1) A description of what you think the bug is
-2) Environment (Linux distro version, kernel version). This is mostly
+2) Environment (e.g. Linux distro version, kernel version). This is mostly
needed if it's a build bug.
-3) The output from fio --version.
+3) The output from fio --version .
4) How to reproduce. Please include a full list of the parameters
passed to fio and the job file used (if any).
@@ -19,3 +23,20 @@ is left out and has to be asked for will add to the turn-around time
of getting to the bottom of the issue, and an eventual fix.
That's it!
+
+...via GitHub issues
+====================
+
+Please create an issue in the GitHub issue tracker
+(https://github.com/axboe/fio/issues ) but observe the following:
+
+a) If you are asking a question on how to do something ("How do I/Why is?")
+ please send it to the mailing list and not GitHub issues. The fio project
+ uses GitHub issues for reproducible bugs/enhancement requests.
+b) Please reproduce your bug using the latest fio listed on
+ https://github.com/axboe/fio/releases (see the Source and Building sections
+ of the README for how to build fio from source).
+c) Include all of the information requested in the mailing list section above
+ (description, environment, version, reproduction steps and all job parameters).
+
+Thanks!
diff --git a/backend.c b/backend.c
index f91f3caf..808e4362 100644
--- a/backend.c
+++ b/backend.c
@@ -62,8 +62,9 @@ struct io_log *agg_io_log[DDIR_RWDIR_CNT];
int groupid = 0;
unsigned int thread_number = 0;
+unsigned int nr_segments = 0;
+unsigned int cur_segment = 0;
unsigned int stat_number = 0;
-int shm_id = 0;
int temp_stall_ts;
unsigned long done_secs = 0;
#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
@@ -76,7 +77,7 @@ pthread_mutex_t overlap_check = PTHREAD_MUTEX_INITIALIZER;
static void sig_int(int sig)
{
- if (threads) {
+ if (nr_segments) {
if (is_backend)
fio_server_got_signal(sig);
else {
@@ -392,7 +393,7 @@ static bool break_on_this_error(struct thread_data *td, enum fio_ddir ddir,
td_clear_error(td);
*retptr = 0;
return false;
- } else if (td->o.fill_device && err == ENOSPC) {
+ } else if (td->o.fill_device && (err == ENOSPC || err == EDQUOT)) {
/*
* We expect to hit this error if
* fill_device option is set.
@@ -438,7 +439,7 @@ static int wait_for_completions(struct thread_data *td, struct timespec *time)
if ((full && !min_evts) || !td->o.iodepth_batch_complete_min)
min_evts = 1;
- if (time && __should_check_rate(td))
+ if (time && should_check_rate(td))
fio_gettime(time, NULL);
do {
@@ -493,7 +494,7 @@ int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
requeue_io_u(td, &io_u);
} else {
sync_done:
- if (comp_time && __should_check_rate(td))
+ if (comp_time && should_check_rate(td))
fio_gettime(comp_time, NULL);
*ret = io_u_sync_complete(td, io_u);
@@ -857,14 +858,15 @@ static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
return 0;
}
-static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir)
+static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir,
+ struct timespec *time)
{
unsigned long long b;
uint64_t total;
int left;
- b = ddir_rw_sum(td->io_blocks);
- if (b % td->o.thinktime_blocks)
+ b = ddir_rw_sum(td->thinktime_blocks_counter);
+ if (b % td->o.thinktime_blocks || !b)
return;
io_u_quiesce(td);
@@ -897,6 +899,9 @@ static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir)
/* adjust for rate_process=poisson */
td->last_usec[ddir] += total;
}
+
+ if (time && should_check_rate(td))
+ fio_gettime(time, NULL);
}
/*
@@ -1075,6 +1080,10 @@ reap:
}
if (ret < 0)
break;
+
+ if (ddir_rw(ddir) && td->o.thinktime)
+ handle_thinktime(td, ddir, &comp_time);
+
if (!ddir_rw_sum(td->bytes_done) &&
!td_ioengine_flagged(td, FIO_NOIO))
continue;
@@ -1089,9 +1098,6 @@ reap:
}
if (!in_ramp_time(td) && td->o.latency_target)
lat_target_check(td);
-
- if (ddir_rw(ddir) && td->o.thinktime)
- handle_thinktime(td, ddir);
}
check_update_rusage(td);
@@ -1099,7 +1105,7 @@ reap:
if (td->trim_entries)
log_err("fio: %lu trim entries leaked?\n", td->trim_entries);
- if (td->o.fill_device && td->error == ENOSPC) {
+ if (td->o.fill_device && (td->error == ENOSPC || td->error == EDQUOT)) {
td->error = 0;
fio_mark_td_terminate(td);
}
@@ -1114,7 +1120,8 @@ reap:
if (i) {
ret = io_u_queued_complete(td, i);
- if (td->o.fill_device && td->error == ENOSPC)
+ if (td->o.fill_device &&
+ (td->error == ENOSPC || td->error == EDQUOT))
td->error = 0;
}
@@ -1335,22 +1342,19 @@ int init_io_u_buffers(struct thread_data *td)
return 0;
}
+#ifdef FIO_HAVE_IOSCHED_SWITCH
/*
- * This function is Linux specific.
+ * These functions are Linux specific.
* FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux.
*/
-static int switch_ioscheduler(struct thread_data *td)
+static int set_ioscheduler(struct thread_data *td, struct fio_file *file)
{
-#ifdef FIO_HAVE_IOSCHED_SWITCH
char tmp[256], tmp2[128], *p;
FILE *f;
int ret;
- if (td_ioengine_flagged(td, FIO_DISKLESSIO))
- return 0;
-
- assert(td->files && td->files[0]);
- sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root);
+ assert(file->du && file->du->sysfs_root);
+ sprintf(tmp, "%s/queue/scheduler", file->du->sysfs_root);
f = fopen(tmp, "r+");
if (!f) {
@@ -1403,7 +1407,7 @@ static int switch_ioscheduler(struct thread_data *td)
sprintf(tmp2, "[%s]", td->o.ioscheduler);
if (!strstr(tmp, tmp2)) {
- log_err("fio: io scheduler %s not found\n", td->o.ioscheduler);
+ log_err("fio: unable to set io scheduler to %s\n", td->o.ioscheduler);
td_verror(td, EINVAL, "iosched_switch");
fclose(f);
return 1;
@@ -1411,11 +1415,55 @@ static int switch_ioscheduler(struct thread_data *td)
fclose(f);
return 0;
+}
+
+static int switch_ioscheduler(struct thread_data *td)
+{
+ struct fio_file *f;
+ unsigned int i;
+ int ret = 0;
+
+ if (td_ioengine_flagged(td, FIO_DISKLESSIO))
+ return 0;
+
+ assert(td->files && td->files[0]);
+
+ for_each_file(td, f, i) {
+
+ /* Only consider regular files and block device files */
+ switch (f->filetype) {
+ case FIO_TYPE_FILE:
+ case FIO_TYPE_BLOCK:
+ /*
+ * Make sure that the device hosting the file could
+ * be determined.
+ */
+ if (!f->du)
+ continue;
+ break;
+ case FIO_TYPE_CHAR:
+ case FIO_TYPE_PIPE:
+ default:
+ continue;
+ }
+
+ ret = set_ioscheduler(td, f);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
#else
+
+static int switch_ioscheduler(struct thread_data *td)
+{
return 0;
-#endif
}
+#endif /* FIO_HAVE_IOSCHED_SWITCH */
+
static bool keep_running(struct thread_data *td)
{
unsigned long long limit;
@@ -1743,6 +1791,11 @@ static void *thread_main(void *data)
if (rate_submit_init(td, sk_out))
goto err;
+ if (td->o.thinktime_blocks_type == THINKTIME_BLOCKS_TYPE_COMPLETE)
+ td->thinktime_blocks_counter = td->io_blocks;
+ else
+ td->thinktime_blocks_counter = td->io_issues;
+
set_epoch_time(td, o->log_unix_epoch);
fio_getrusage(&td->ru_start);
memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
@@ -2526,6 +2579,7 @@ int fio_backend(struct sk_out *sk_out)
for_each_td(td, i) {
steadystate_free(td);
fio_options_free(td);
+ fio_dump_options_free(td);
if (td->rusage_sem) {
fio_sem_remove(td->rusage_sem);
td->rusage_sem = NULL;
diff --git a/cconv.c b/cconv.c
index 488dd799..e3a8c27c 100644
--- a/cconv.c
+++ b/cconv.c
@@ -143,6 +143,8 @@ void convert_thread_options_to_cpu(struct thread_options *o,
o->rate_iops_min[i] = le32_to_cpu(top->rate_iops_min[i]);
o->perc_rand[i] = le32_to_cpu(top->perc_rand[i]);
+
+ o->max_latency[i] = le64_to_cpu(top->max_latency[i]);
}
o->ratecycle = le32_to_cpu(top->ratecycle);
@@ -203,12 +205,14 @@ void convert_thread_options_to_cpu(struct thread_options *o,
o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i));
o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i));
o->gauss_dev.u.f = fio_uint64_to_double(le64_to_cpu(top->gauss_dev.u.i));
+ o->random_center.u.f = fio_uint64_to_double(le64_to_cpu(top->random_center.u.i));
o->random_generator = le32_to_cpu(top->random_generator);
o->hugepage_size = le32_to_cpu(top->hugepage_size);
o->rw_min_bs = le64_to_cpu(top->rw_min_bs);
o->thinktime = le32_to_cpu(top->thinktime);
o->thinktime_spin = le32_to_cpu(top->thinktime_spin);
o->thinktime_blocks = le32_to_cpu(top->thinktime_blocks);
+ o->thinktime_blocks_type = le32_to_cpu(top->thinktime_blocks_type);
o->fsync_blocks = le32_to_cpu(top->fsync_blocks);
o->fdatasync_blocks = le32_to_cpu(top->fdatasync_blocks);
o->barrier_blocks = le32_to_cpu(top->barrier_blocks);
@@ -227,6 +231,8 @@ void convert_thread_options_to_cpu(struct thread_options *o,
o->zone_capacity = le64_to_cpu(top->zone_capacity);
o->zone_skip = le64_to_cpu(top->zone_skip);
o->zone_mode = le32_to_cpu(top->zone_mode);
+ o->max_open_zones = __le32_to_cpu(top->max_open_zones);
+ o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits);
o->lockmem = le64_to_cpu(top->lockmem);
o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent);
o->offset_increment = le64_to_cpu(top->offset_increment);
@@ -287,12 +293,13 @@ void convert_thread_options_to_cpu(struct thread_options *o,
o->sync_file_range = le32_to_cpu(top->sync_file_range);
o->latency_target = le64_to_cpu(top->latency_target);
o->latency_window = le64_to_cpu(top->latency_window);
- o->max_latency = le64_to_cpu(top->max_latency);
o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i));
o->latency_run = le32_to_cpu(top->latency_run);
o->compress_percentage = le32_to_cpu(top->compress_percentage);
o->compress_chunk = le32_to_cpu(top->compress_chunk);
o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
+ o->dedupe_mode = le32_to_cpu(top->dedupe_mode);
+ o->dedupe_working_set_percentage = le32_to_cpu(top->dedupe_working_set_percentage);
o->block_error_hist = le32_to_cpu(top->block_error_hist);
o->replay_align = le32_to_cpu(top->replay_align);
o->replay_scale = le32_to_cpu(top->replay_scale);
@@ -423,12 +430,14 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f));
top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f));
top->gauss_dev.u.i = __cpu_to_le64(fio_double_to_uint64(o->gauss_dev.u.f));
+ top->random_center.u.i = __cpu_to_le64(fio_double_to_uint64(o->random_center.u.f));
top->random_generator = cpu_to_le32(o->random_generator);
top->hugepage_size = cpu_to_le32(o->hugepage_size);
top->rw_min_bs = __cpu_to_le64(o->rw_min_bs);
top->thinktime = cpu_to_le32(o->thinktime);
top->thinktime_spin = cpu_to_le32(o->thinktime_spin);
top->thinktime_blocks = cpu_to_le32(o->thinktime_blocks);
+ top->thinktime_blocks_type = __cpu_to_le32(o->thinktime_blocks_type);
top->fsync_blocks = cpu_to_le32(o->fsync_blocks);
top->fdatasync_blocks = cpu_to_le32(o->fdatasync_blocks);
top->barrier_blocks = cpu_to_le32(o->barrier_blocks);
@@ -487,12 +496,13 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
top->sync_file_range = cpu_to_le32(o->sync_file_range);
top->latency_target = __cpu_to_le64(o->latency_target);
top->latency_window = __cpu_to_le64(o->latency_window);
- top->max_latency = __cpu_to_le64(o->max_latency);
top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f));
top->latency_run = __cpu_to_le32(o->latency_run);
top->compress_percentage = cpu_to_le32(o->compress_percentage);
top->compress_chunk = cpu_to_le32(o->compress_chunk);
top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
+ top->dedupe_mode = cpu_to_le32(o->dedupe_mode);
+ top->dedupe_working_set_percentage = cpu_to_le32(o->dedupe_working_set_percentage);
top->block_error_hist = cpu_to_le32(o->block_error_hist);
top->replay_align = cpu_to_le32(o->replay_align);
top->replay_scale = cpu_to_le32(o->replay_scale);
@@ -546,6 +556,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
top->rate_iops_min[i] = cpu_to_le32(o->rate_iops_min[i]);
top->perc_rand[i] = cpu_to_le32(o->perc_rand[i]);
+
+ top->max_latency[i] = __cpu_to_le64(o->max_latency[i]);
}
memcpy(top->verify_pattern, o->verify_pattern, MAX_PATTERN_SIZE);
@@ -567,6 +579,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
top->zone_capacity = __cpu_to_le64(o->zone_capacity);
top->zone_skip = __cpu_to_le64(o->zone_skip);
top->zone_mode = __cpu_to_le32(o->zone_mode);
+ top->max_open_zones = __cpu_to_le32(o->max_open_zones);
+ top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits);
top->lockmem = __cpu_to_le64(o->lockmem);
top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add);
top->file_size_low = __cpu_to_le64(o->file_size_low);
diff --git a/ci/appveyor-install.sh b/ci/appveyor-install.sh
new file mode 100755
index 00000000..3137f39e
--- /dev/null
+++ b/ci/appveyor-install.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# The PATH to appropriate distro commands must already be set before invoking
+# this script
+# The following environment variables must be set:
+# ARCHITECTURE={x86,x64}
+# DISTRO={cygwin,msys2}
+# The following environment can optionally be set:
+# CYG_MIRROR=<URL>
+set -eu
+
+case "${ARCHITECTURE}" in
+ "x64")
+ PACKAGE_ARCH="x86_64"
+ ;;
+ "x86")
+ PACKAGE_ARCH="i686"
+ ;;
+esac
+
+echo "Installing packages..."
+case "${DISTRO}" in
+ "cygwin")
+ CYG_MIRROR=${CYG_MIRROR:-"http://cygwin.mirror.constant.com"}
+ setup-x86_64.exe --quiet-mode --no-shortcuts --only-site \
+ --site "${CYG_MIRROR}" --packages \
+ "mingw64-${PACKAGE_ARCH}-CUnit,mingw64-${PACKAGE_ARCH}-zlib"
+ ;;
+ "msys2")
+ #pacman --noconfirm -Syuu # MSYS2 core update
+ #pacman --noconfirm -Syuu # MSYS2 normal update
+ pacman.exe --noconfirm -S \
+ mingw-w64-${PACKAGE_ARCH}-clang \
+ mingw-w64-${PACKAGE_ARCH}-cunit \
+ mingw-w64-${PACKAGE_ARCH}-toolchain \
+ mingw-w64-${PACKAGE_ARCH}-lld
+ pacman.exe -Q # List installed packages
+ ;;
+esac
+
+python.exe -m pip install scipy six
+
+echo "Python3 path: $(type -p python3 2>&1)"
+echo "Python3 version: $(python3 -V 2>&1)"
diff --git a/ci/travis-build.sh b/ci/travis-build.sh
index 231417e2..923d882d 100755
--- a/ci/travis-build.sh
+++ b/ci/travis-build.sh
@@ -1,8 +1,9 @@
#!/bin/bash
+set -eu
CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}"
EXTRA_CFLAGS="-Werror"
-PYTHONUNBUFFERED=TRUE
+export PYTHONUNBUFFERED=TRUE
CONFIGURE_FLAGS=()
case "$TRAVIS_OS_NAME" in
@@ -11,6 +12,7 @@ case "$TRAVIS_OS_NAME" in
case "$CI_TARGET_ARCH" in
"x86")
EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32"
+ export LDFLAGS="-m32"
;;
"amd64")
CONFIGURE_FLAGS+=(--enable-cuda)
@@ -24,7 +26,7 @@ CONFIGURE_FLAGS+=(--extra-cflags="${EXTRA_CFLAGS}")
make &&
make test &&
if [[ "$CI_TARGET_ARCH" == "arm64" ]]; then
- sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
+ sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
else
- sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
+ sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
fi
diff --git a/ci/travis-install-librpma.sh b/ci/travis-install-librpma.sh
new file mode 100755
index 00000000..b127f3f5
--- /dev/null
+++ b/ci/travis-install-librpma.sh
@@ -0,0 +1,22 @@
+#!/bin/bash -e
+
+# 11.02.2021 Merge pull request #866 from ldorau/rpma-mmap-memory-for-rpma_mr_reg-in-rpma_flush_apm_new
+LIBRPMA_VERSION=fbac593917e98f3f26abf14f4fad5a832b330f5c
+ZIP_FILE=rpma.zip
+
+WORKDIR=$(pwd)
+
+# install librpma
+wget -O $ZIP_FILE https://github.com/pmem/rpma/archive/${LIBRPMA_VERSION}.zip
+unzip $ZIP_FILE
+mkdir -p rpma-${LIBRPMA_VERSION}/build
+cd rpma-${LIBRPMA_VERSION}/build
+cmake .. -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_INSTALL_PREFIX=/usr \
+ -DBUILD_DOC=OFF \
+ -DBUILD_EXAMPLES=OFF \
+ -DBUILD_TESTS=OFF
+make -j$(nproc)
+sudo make -j$(nproc) install
+cd $WORKDIR
+rm -rf $ZIP_FILE rpma-${LIBRPMA_VERSION}
diff --git a/ci/travis-install-pmdk.sh b/ci/travis-install-pmdk.sh
new file mode 100755
index 00000000..803438f8
--- /dev/null
+++ b/ci/travis-install-pmdk.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -e
+
+# pmdk v1.9.1 release
+PMDK_VERSION=1.9.1
+
+WORKDIR=$(pwd)
+
+#
+# The '/bin/sh' shell used by PMDK's 'make install'
+# does not know the exact localization of clang
+# and fails with:
+# /bin/sh: 1: clang: not found
+# if CC is not set to the full path of clang.
+#
+export CC=$(which $CC)
+
+# Install PMDK libraries, because PMDK's libpmem
+# is a dependency of the librpma fio engine.
+# Install it from a release package
+# with already generated documentation,
+# in order to not install 'pandoc'.
+wget https://github.com/pmem/pmdk/releases/download/${PMDK_VERSION}/pmdk-${PMDK_VERSION}.tar.gz
+tar -xzf pmdk-${PMDK_VERSION}.tar.gz
+cd pmdk-${PMDK_VERSION}
+make -j$(nproc) NDCTL_ENABLE=n
+sudo make -j$(nproc) install prefix=/usr NDCTL_ENABLE=n
+cd $WORKDIR
+rm -rf pmdk-${PMDK_VERSION}
diff --git a/ci/travis-install.sh b/ci/travis-install.sh
index b6895e82..4c4c04c5 100755
--- a/ci/travis-install.sh
+++ b/ci/travis-install.sh
@@ -1,5 +1,5 @@
#!/bin/bash
-set -e
+set -eu
CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}"
case "$TRAVIS_OS_NAME" in
@@ -43,6 +43,16 @@ case "$TRAVIS_OS_NAME" in
)
sudo apt-get -qq update
sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}"
+ # librpma is supported on the amd64 (x86_64) architecture for now
+ if [[ $CI_TARGET_ARCH == "amd64" ]]; then
+ # install libprotobuf-c-dev required by librpma_gpspm
+ sudo apt-get install --no-install-recommends -qq -y libprotobuf-c-dev
+ # PMDK libraries have to be installed, because
+ # libpmem is a dependency of the librpma fio engine
+ ci/travis-install-pmdk.sh
+ # install librpma from sources from GitHub
+ ci/travis-install-librpma.sh
+ fi
;;
"osx")
brew update >/dev/null 2>&1
@@ -51,6 +61,5 @@ case "$TRAVIS_OS_NAME" in
;;
esac
-echo "Python version: $(/usr/bin/python -V 2>&1)"
-echo "Python3 path: $(which python3 2>&1)"
+echo "Python3 path: $(type -p python3 2>&1)"
echo "Python3 version: $(python3 -V 2>&1)"
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 8988236c..44fa87b9 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -62,8 +62,8 @@
#endif
#ifdef FIO_INTERNAL
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
-#define FIELD_SIZE(s, f) (sizeof(((__typeof__(s))0)->f))
+#define FIO_ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
+#define FIO_FIELD_SIZE(s, f) (sizeof(((__typeof__(s))0)->f))
#endif
#ifndef __has_attribute
diff --git a/configure b/configure
index 08571fb0..84ccce04 100755
--- a/configure
+++ b/configure
@@ -45,6 +45,7 @@ print_config() {
# Default CFLAGS
CFLAGS="-D_GNU_SOURCE -include config-host.h $CFLAGS"
+CONFIGURE_CFLAGS="-Werror-implicit-function-declaration"
BUILD_CFLAGS=""
# Print a helpful header at the top of config.log
@@ -88,14 +89,14 @@ do_cc() {
}
compile_object() {
- do_cc $CFLAGS -Werror-implicit-function-declaration -c -o $TMPO $TMPC
+ do_cc $CFLAGS $CONFIGURE_CFLAGS -c -o $TMPO $TMPC
}
compile_prog() {
local_cflags="$1"
local_ldflags="$2 $LIBS"
echo "Compiling test case $3" >> config.log
- do_cc $CFLAGS -Werror-implicit-function-declaration $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
+ do_cc $CFLAGS $CONFIGURE_CFLAGS $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
}
feature_not_found() {
@@ -141,7 +142,7 @@ check_min_lib_version() {
fi
: "${_feature:=${1}}"
if "${cross_prefix}"pkg-config --version > /dev/null 2>&1; then
- if eval "echo \$$_feature" = "yes" ; then
+ if test "$(eval echo \"\$$_feature\")" = "yes" ; then
feature_not_found "$_feature" "$1 >= $2"
fi
else
@@ -162,14 +163,16 @@ pmemblk="no"
devdax="no"
pmem="no"
cuda="no"
+libcufile="no"
disable_lex=""
disable_pmem="no"
disable_native="no"
march_set="no"
libiscsi="no"
libnbd="no"
-libaio_uring="no"
+libnfs="no"
libzbc=""
+dfs=""
dynamic_engines="no"
prefix=/usr/local
@@ -193,6 +196,8 @@ for opt do
;;
--target-win-ver=*) target_win_ver="$optarg"
;;
+ --enable-pdb) pdb="yes"
+ ;;
--build-static) build_static="yes"
;;
--enable-gfio) gfio_check="yes"
@@ -223,6 +228,8 @@ for opt do
;;
--enable-cuda) cuda="yes"
;;
+ --enable-libcufile) libcufile="yes"
+ ;;
--disable-native) disable_native="yes"
;;
--with-ime=*) ime_path="$optarg"
@@ -235,10 +242,12 @@ for opt do
;;
--disable-tcmalloc) disable_tcmalloc="yes"
;;
- --enable-libaio-uring) libaio_uring="yes"
+ --disable-nfs) disable_nfs="yes"
;;
--dynamic-libengines) dynamic_engines="yes"
;;
+ --disable-dfs) dfs="no"
+ ;;
--help)
show_help="yes"
;;
@@ -255,7 +264,8 @@ if test "$show_help" = "yes" ; then
echo "--cc= Specify compiler to use"
echo "--extra-cflags= Specify extra CFLAGS to pass to compiler"
echo "--build-32bit-win Enable 32-bit build on Windows"
- echo "--target-win-ver= Minimum version of Windows to target (XP or 7)"
+ echo "--target-win-ver= Minimum version of Windows to target (only accepts 7)"
+ echo "--enable-pdb Enable Windows PDB symbols generation (needs clang/lld)"
echo "--build-static Build a static fio"
echo "--esx Configure build options for esx"
echo "--enable-gfio Enable building of gtk gfio"
@@ -264,22 +274,25 @@ if test "$show_help" = "yes" ; then
echo "--disable-rados Disable Rados support even if found"
echo "--disable-rbd Disable Rados Block Device even if found"
echo "--disable-http Disable HTTP support even if found"
+ echo "--disable-nfs Disable userspace NFS support even if found"
echo "--disable-gfapi Disable gfapi"
echo "--enable-libhdfs Enable hdfs support"
+ echo "--enable-libnfs Enable nfs support"
echo "--disable-lex Disable use of lex/yacc for math"
echo "--disable-pmem Disable pmem based engines even if found"
echo "--enable-lex Enable use of lex/yacc for math"
echo "--disable-shm Disable SHM support"
echo "--disable-optimizations Don't enable compiler optimizations"
echo "--enable-cuda Enable GPUDirect RDMA support"
+ echo "--enable-libcufile Enable GPUDirect Storage cuFile support"
echo "--disable-native Don't build for native host"
echo "--with-ime= Install path for DDN's Infinite Memory Engine"
echo "--enable-libiscsi Enable iscsi support"
echo "--enable-libnbd Enable libnbd (NBD engine) support"
echo "--disable-libzbc Disable libzbc even if found"
echo "--disable-tcmalloc Disable tcmalloc support"
- echo "--enable-libaio-uring Enable libaio emulated over io_uring"
echo "--dynamic-libengines Lib-based ioengines as dynamic libraries"
+ echo "--disable-dfs Disable DAOS File System support even if found"
exit $exit_val
fi
@@ -357,16 +370,15 @@ Darwin)
if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then
cpu="x86_64"
fi
- # Error at compile time linking of weak/partial symbols if possible...
+ # Avoid configure feature detection of features provided by weak symbols
cat > $TMPC <<EOF
int main(void)
{
return 0;
}
EOF
- if compile_prog "" "-Wl,-no_weak_imports" "disable weak symbols"; then
- echo "Disabling weak symbols"
- LDFLAGS="$LDFLAGS -Wl,-no_weak_imports"
+ if compile_prog "" "-Werror=partial-availability" "error on weak symbols"; then
+ CONFIGURE_CFLAGS="$CONFIGURE_CFLAGS -Werror=partial-availability"
fi
;;
SunOS)
@@ -392,9 +404,7 @@ CYGWIN*)
# Default Windows API target
target_win_ver="7"
fi
- if test "$target_win_ver" = "XP"; then
- output_sym "CONFIG_WINDOWS_XP"
- elif test "$target_win_ver" = "7"; then
+ if test "$target_win_ver" = "7"; then
output_sym "CONFIG_WINDOWS_7"
CFLAGS="$CFLAGS -D_WIN32_WINNT=0x0601"
else
@@ -412,6 +422,8 @@ CYGWIN*)
clock_gettime="yes" # clock_monotonic probe has dependency on this
clock_monotonic="yes"
sched_idle="yes"
+ pthread_condattr_setclock="no"
+	pthread_getaffinity="no"
;;
esac
@@ -648,22 +660,13 @@ int main(void)
return 0;
}
EOF
- if test "$libaio_uring" = "yes"; then
- if compile_prog "" "-luring" "libaio io_uring" ; then
- libaio=yes
- LIBS="-luring $LIBS"
- else
- feature_not_found "libaio io_uring" ""
- fi
- elif compile_prog "" "-laio" "libaio" ; then
+ if compile_prog "" "-laio" "libaio" ; then
libaio=yes
- libaio_uring=no
else
if test "$libaio" = "yes" ; then
feature_not_found "linux AIO" "libaio-dev or libaio-devel"
fi
libaio=no
- libaio_uring=no
fi
cat > $TMPC <<EOF
@@ -684,7 +687,6 @@ EOF
fi
print_config "Linux AIO support" "$libaio"
print_config "Linux AIO support rw flags" "$libaio_rw_flags"
-print_config "Linux AIO over io_uring" "$libaio_uring"
##########################################
# posix aio probe
@@ -767,10 +769,8 @@ print_config "POSIX pshared support" "$posix_pshared"
##########################################
# POSIX pthread_condattr_setclock() probe
-if test "$pthread_condattr_setclock" != "yes" ; then
- pthread_condattr_setclock="no"
-fi
-cat > $TMPC <<EOF
+if test "$pthread_condattr_setclock" != "no" ; then
+ cat > $TMPC <<EOF
#include <pthread.h>
int main(void)
{
@@ -779,11 +779,12 @@ int main(void)
return 0;
}
EOF
-if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then
- pthread_condattr_setclock=yes
-elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then
- pthread_condattr_setclock=yes
- LIBS="$LIBS -lpthread"
+ if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then
+ pthread_condattr_setclock=yes
+ elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then
+ pthread_condattr_setclock=yes
+ LIBS="$LIBS -lpthread"
+ fi
fi
print_config "pthread_condattr_setclock()" "$pthread_condattr_setclock"
@@ -809,6 +810,29 @@ fi
print_config "pthread_sigmask()" "$pthread_sigmask"
##########################################
+# pthread_getaffinity_np() probe
+if test "$pthread_getaffinity" != "yes" ; then
+ pthread_getaffinity="no"
+fi
+cat > $TMPC <<EOF
+#include <stddef.h> /* NULL */
+#include <signal.h> /* pthread_sigmask() */
+#include <pthread.h>
+int main(void)
+{
+ cpu_set_t set;
+ return pthread_getaffinity_np(pthread_self(), sizeof(set), &set);
+}
+EOF
+if compile_prog "" "$LIBS" "pthread_getaffinity" ; then
+ pthread_getaffinity="yes"
+elif compile_prog "" "$LIBS -lpthread" "pthread_getaffinity" ; then
+ pthread_getaffinity="yes"
+ LIBS="$LIBS -lpthread"
+fi
+print_config "pthread_getaffinity_np()" "$pthread_getaffinity"
+
+##########################################
# solaris aio probe
if test "$solaris_aio" != "yes" ; then
solaris_aio="no"
@@ -930,6 +954,49 @@ fi
print_config "rdmacm" "$rdmacm"
##########################################
+# librpma probe
+if test "$librpma" != "yes" ; then
+ librpma="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <librpma.h>
+int main(int argc, char **argv)
+{
+ enum rpma_conn_event event = RPMA_CONN_REJECTED;
+ (void) event; /* unused */
+ rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+ return 0;
+}
+EOF
+if test "$disable_rdma" != "yes" && compile_prog "" "-lrpma" "rpma"; then
+ librpma="yes"
+fi
+print_config "librpma" "$librpma"
+
+##########################################
+# libprotobuf-c probe
+if test "$libprotobuf_c" != "yes" ; then
+ libprotobuf_c="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <protobuf-c/protobuf-c.h>
+#if !defined(PROTOBUF_C_VERSION_NUMBER)
+# error PROTOBUF_C_VERSION_NUMBER is not defined!
+#endif
+int main(int argc, char **argv)
+{
+ (void)protobuf_c_message_check(NULL);
+ return 0;
+}
+EOF
+if compile_prog "" "-lprotobuf-c" "protobuf_c"; then
+ libprotobuf_c="yes"
+fi
+print_config "libprotobuf_c" "$libprotobuf_c"
+
+##########################################
# asprintf() and vasprintf() probes
if test "$have_asprintf" != "yes" ; then
have_asprintf="no"
@@ -939,7 +1006,8 @@ cat > $TMPC << EOF
int main(int argc, char **argv)
{
- return asprintf(NULL, "%s", "str") == 0;
+ char *buf;
+ return asprintf(&buf, "%s", "str") == 0;
}
EOF
if compile_prog "" "" "have_asprintf"; then
@@ -956,7 +1024,8 @@ cat > $TMPC << EOF
int main(int argc, char **argv)
{
va_list ap;
- return vasprintf(NULL, "%s", ap) == 0;
+ char *buf;
+ return vasprintf(&buf, "%s", ap) == 0;
}
EOF
if compile_prog "" "" "have_vasprintf"; then
@@ -1098,46 +1167,6 @@ fi
print_config "CLOCK_MONOTONIC" "$clock_monotonic"
##########################################
-# CLOCK_MONOTONIC_RAW probe
-if test "$clock_monotonic_raw" != "yes" ; then
- clock_monotonic_raw="no"
-fi
-if test "$clock_gettime" = "yes" ; then
- cat > $TMPC << EOF
-#include <stdio.h>
-#include <time.h>
-int main(int argc, char **argv)
-{
- return clock_gettime(CLOCK_MONOTONIC_RAW, NULL);
-}
-EOF
- if compile_prog "" "$LIBS" "clock monotonic"; then
- clock_monotonic_raw="yes"
- fi
-fi
-print_config "CLOCK_MONOTONIC_RAW" "$clock_monotonic_raw"
-
-##########################################
-# CLOCK_MONOTONIC_PRECISE probe
-if test "$clock_monotonic_precise" != "yes" ; then
- clock_monotonic_precise="no"
-fi
-if test "$clock_gettime" = "yes" ; then
- cat > $TMPC << EOF
-#include <stdio.h>
-#include <time.h>
-int main(int argc, char **argv)
-{
- return clock_gettime(CLOCK_MONOTONIC_PRECISE, NULL);
-}
-EOF
- if compile_prog "" "$LIBS" "clock monotonic precise"; then
- clock_monotonic_precise="yes"
- fi
-fi
-print_config "CLOCK_MONOTONIC_PRECISE" "$clock_monotonic_precise"
-
-##########################################
# clockid_t probe
if test "$clockid_t" != "yes" ; then
clockid_t="no"
@@ -2102,7 +2131,7 @@ cat > $TMPC << EOF
int main(int argc, char **argv)
{
int rc;
- rc = pmem_is_pmem(NULL, NULL);
+ rc = pmem_is_pmem(NULL, 0);
return 0;
}
EOF
@@ -2227,6 +2256,48 @@ fi
print_config "NBD engine" "$libnbd"
##########################################
+# check for dfs (DAOS File System)
+if test "$dfs" != "no" ; then
+ cat > $TMPC << EOF
+#include <fcntl.h>
+#include <daos.h>
+#include <daos_fs.h>
+
+int main(int argc, char **argv)
+{
+ daos_handle_t poh;
+ daos_handle_t coh;
+ dfs_t *dfs;
+
+ (void) dfs_mount(poh, coh, O_RDWR, &dfs);
+
+ return 0;
+}
+EOF
+ if compile_prog "" "-luuid -ldfs -ldaos" "dfs"; then
+ dfs="yes"
+ else
+ dfs="no"
+ fi
+fi
+print_config "DAOS File System (dfs) Engine" "$dfs"
+
+##########################################
+# Check if we have libnfs (for userspace nfs support).
+if test "$disable_nfs" != "yes"; then
+ if $(pkg-config libnfs > /dev/null 2>&1); then
+ libnfs="yes"
+ libnfs_cflags=$(pkg-config --cflags libnfs)
+ libnfs_libs=$(pkg-config --libs libnfs)
+ else
+ if test "$libnfs" = "yes" ; then
+      feature_not_found "libnfs" "libnfs-dev / libnfs-devel"
+ fi
+ fi
+fi
+print_config "NFS engine" "$libnfs"
+
+##########################################
# Check if we have lex/yacc available
yacc="no"
yacc_is_bison="no"
@@ -2234,19 +2305,14 @@ lex="no"
arith="no"
if test "$disable_lex" = "no" || test -z "$disable_lex" ; then
if test "$targetos" != "SunOS" ; then
-LEX=$(which lex 2> /dev/null)
-if test -x "$LEX" ; then
+if has lex; then
lex="yes"
fi
-YACC=$(which bison 2> /dev/null)
-if test -x "$YACC" ; then
+if has bison; then
yacc="yes"
yacc_is_bison="yes"
-else
- YACC=$(which yacc 2> /dev/null)
- if test -x "$YACC" ; then
- yacc="yes"
- fi
+elif has yacc; then
+ yacc="yes"
fi
if test "$yacc" = "yes" && test "$lex" = "yes" ; then
arith="yes"
@@ -2262,7 +2328,9 @@ int main(int argc, char **argv)
return 0;
}
EOF
-if compile_prog "" "-ll" "lex"; then
+if compile_prog "" "-lfl" "flex"; then
+ LIBS="-lfl $LIBS"
+elif compile_prog "" "-ll" "lex"; then
LIBS="-ll $LIBS"
else
arith="no"
@@ -2276,8 +2344,7 @@ if test "$arith" = "yes" ; then
if test "$force_no_lex_o" = "yes" ; then
lex_use_o="no"
else
-$LEX -o lex.yy.c exp/expression-parser.l 2> /dev/null
-if test "$?" = "0" ; then
+if lex -o lex.yy.c exp/expression-parser.l 2> /dev/null; then
lex_use_o="yes"
else
lex_use_o="no"
@@ -2547,6 +2614,29 @@ fi
print_config "cuda" "$cuda"
##########################################
+# libcufile probe
+if test "$libcufile" != "no" ; then
+cat > $TMPC << EOF
+#include <cufile.h>
+
+int main(int argc, char* argv[]) {
+ cuFileDriverOpen();
+ return 0;
+}
+EOF
+ if compile_prog "" "-lcuda -lcudart -lcufile" "libcufile"; then
+ libcufile="yes"
+ LIBS="-lcuda -lcudart -lcufile $LIBS"
+ else
+ if test "$libcufile" = "yes" ; then
+ feature_not_found "libcufile" ""
+ fi
+ libcufile="no"
+ fi
+fi
+print_config "libcufile" "$libcufile"
+
+##########################################
# check for cc -march=native
build_native="no"
cat > $TMPC << EOF
@@ -2698,6 +2788,47 @@ if compile_prog "" "" "statx_syscall"; then
fi
print_config "statx(2)/syscall" "$statx_syscall"
+##########################################
+# check for Windows PDB generation support
+if test "$pdb" != "no" ; then
+ cat > $TMPC <<EOF
+int main(void)
+{
+ return 0;
+}
+EOF
+ if compile_prog "-g -gcodeview" "-fuse-ld=lld -Wl,-pdb,$TMPO" "pdb"; then
+ pdb=yes
+ else
+ if test "$pdb" = "yes"; then
+ feature_not_found "PDB" "clang and lld"
+ fi
+ pdb=no
+ fi
+else
+ pdb=no
+fi
+print_config "Windows PDB generation" "$pdb"
+
+##########################################
+# check for timerfd support
+timerfd_create="no"
+if test "$esx" != "yes" ; then
+cat > $TMPC << EOF
+#include <sys/time.h>
+#include <sys/timerfd.h>
+
+int main(int argc, char **argv)
+{
+ return timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+}
+EOF
+ if compile_prog "" "" "timerfd_create"; then
+ timerfd_create="yes"
+ fi
+fi
+print_config "timerfd_create" "$timerfd_create"
+
#############################################################################
if test "$wordsize" = "64" ; then
@@ -2720,9 +2851,6 @@ if test "$libaio" = "yes" ; then
if test "$libaio_rw_flags" = "yes" ; then
output_sym "CONFIG_LIBAIO_RW_FLAGS"
fi
- if test "$libaio_uring" = "yes" ; then
- output_sym "CONFIG_LIBAIO_URING"
- fi
fi
if test "$posix_aio" = "yes" ; then
output_sym "CONFIG_POSIXAIO"
@@ -2739,6 +2867,9 @@ fi
if test "$pthread_sigmask" = "yes" ; then
output_sym "CONFIG_PTHREAD_SIGMASK"
fi
+if test "$pthread_getaffinity" = "yes" ; then
+ output_sym "CONFIG_PTHREAD_GETAFFINITY"
+fi
if test "$have_asprintf" = "yes" ; then
output_sym "CONFIG_HAVE_ASPRINTF"
fi
@@ -2778,18 +2909,21 @@ fi
if test "$libverbs" = "yes" -a "$rdmacm" = "yes" ; then
output_sym "CONFIG_RDMA"
fi
+# librpma is supported on the 'x86_64' architecture for now
+if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \
+ -a "$librpma" = "yes" -a "$libpmem" = "yes" ; then
+ output_sym "CONFIG_LIBRPMA_APM"
+fi
+if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \
+ -a "$librpma" = "yes" -a "$libpmem" = "yes" -a "$libprotobuf_c" = "yes" ; then
+ output_sym "CONFIG_LIBRPMA_GPSPM"
+fi
if test "$clock_gettime" = "yes" ; then
output_sym "CONFIG_CLOCK_GETTIME"
fi
if test "$clock_monotonic" = "yes" ; then
output_sym "CONFIG_CLOCK_MONOTONIC"
fi
-if test "$clock_monotonic_raw" = "yes" ; then
- output_sym "CONFIG_CLOCK_MONOTONIC_RAW"
-fi
-if test "$clock_monotonic_precise" = "yes" ; then
- output_sym "CONFIG_CLOCK_MONOTONIC_PRECISE"
-fi
if test "$clockid_t" = "yes"; then
output_sym "CONFIG_CLOCKID_T"
fi
@@ -2931,9 +3065,9 @@ fi
if test "$arith" = "yes" ; then
output_sym "CONFIG_ARITHMETIC"
if test "$yacc_is_bison" = "yes" ; then
- echo "YACC=$YACC -y" >> $config_host_mak
+ echo "YACC=bison -y" >> $config_host_mak
else
- echo "YACC=$YACC" >> $config_host_mak
+ echo "YACC=yacc" >> $config_host_mak
fi
if test "$lex_use_o" = "yes" ; then
echo "CONFIG_LEX_USE_O=y" >> $config_host_mak
@@ -2981,6 +3115,15 @@ fi
if test "$cuda" = "yes" ; then
output_sym "CONFIG_CUDA"
fi
+if test "$libcufile" = "yes" ; then
+ output_sym "CONFIG_LIBCUFILE"
+fi
+if test "$dfs" = "yes" ; then
+ output_sym "CONFIG_DFS"
+fi
+if test "$libnfs" = "yes" ; then
+ output_sym "CONFIG_NFS"
+fi
if test "$march_set" = "no" && test "$build_native" = "yes" ; then
output_sym "CONFIG_BUILD_NATIVE"
fi
@@ -2999,6 +3142,9 @@ fi
if test "$statx_syscall" = "yes"; then
output_sym "CONFIG_HAVE_STATX_SYSCALL"
fi
+if test "$timerfd_create" = "yes"; then
+ output_sym "CONFIG_HAVE_TIMERFD_CREATE"
+fi
if test "$fallthrough" = "yes"; then
CFLAGS="$CFLAGS -Wimplicit-fallthrough"
fi
@@ -3017,9 +3163,19 @@ if test "$libnbd" = "yes" ; then
echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak
echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak
fi
+if test "$libnfs" = "yes" ; then
+ output_sym "CONFIG_LIBNFS"
+ echo "CONFIG_LIBNFS=m" >> $config_host_mak
+ echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak
+ echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak
+fi
if test "$dynamic_engines" = "yes" ; then
output_sym "CONFIG_DYNAMIC_ENGINES"
fi
+if test "$pdb" = yes; then
+ output_sym "CONFIG_PDB"
+fi
+
print_config "Lib-based ioengines dynamic" "$dynamic_engines"
cat > $TMPC << EOF
int main(int argc, char **argv)
diff --git a/dedupe.c b/dedupe.c
new file mode 100644
index 00000000..043a376c
--- /dev/null
+++ b/dedupe.c
@@ -0,0 +1,28 @@
+#include "fio.h"
+
+int init_dedupe_working_set_seeds(struct thread_data *td)
+{
+ unsigned long long i;
+ struct frand_state dedupe_working_set_state = {0};
+
+ if (!td->o.dedupe_percentage || !(td->o.dedupe_mode == DEDUPE_MODE_WORKING_SET))
+ return 0;
+
+ /*
+ * The dedupe working set keeps seeds of unique data (generated by buf_state).
+ * Dedupe-ed pages will be generated using those seeds.
+ */
+ td->num_unique_pages = (td->o.size * (unsigned long long)td->o.dedupe_working_set_percentage / 100) / td->o.min_bs[DDIR_WRITE];
+ td->dedupe_working_set_states = malloc(sizeof(struct frand_state) * td->num_unique_pages);
+ if (!td->dedupe_working_set_states) {
+ log_err("fio: could not allocate dedupe working set\n");
+ return 1;
+ }
+ frand_copy(&dedupe_working_set_state, &td->buf_state);
+ for (i = 0; i < td->num_unique_pages; i++) {
+ frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state);
+ __get_next_seed(&dedupe_working_set_state);
+ }
+
+ return 0;
+}
diff --git a/dedupe.h b/dedupe.h
new file mode 100644
index 00000000..d4c4dc37
--- /dev/null
+++ b/dedupe.h
@@ -0,0 +1,6 @@
+#ifndef DEDUPE_H
+#define DEDUPE_H
+
+int init_dedupe_working_set_seeds(struct thread_data *td);
+
+#endif
diff --git a/diskutil.c b/diskutil.c
index 6c6380bb..ace7af3d 100644
--- a/diskutil.c
+++ b/diskutil.c
@@ -166,14 +166,10 @@ static int get_device_numbers(char *file_name, int *maj, int *min)
if (S_ISBLK(st.st_mode)) {
majdev = major(st.st_rdev);
mindev = minor(st.st_rdev);
- } else if (S_ISCHR(st.st_mode)) {
- majdev = major(st.st_rdev);
- mindev = minor(st.st_rdev);
- if (fio_lookup_raw(st.st_rdev, &majdev, &mindev))
- return -1;
- } else if (S_ISFIFO(st.st_mode))
+ } else if (S_ISCHR(st.st_mode) ||
+ S_ISFIFO(st.st_mode)) {
return -1;
- else {
+ } else {
majdev = major(st.st_dev);
mindev = minor(st.st_dev);
}
@@ -181,7 +177,7 @@ static int get_device_numbers(char *file_name, int *maj, int *min)
/*
* must be a file, open "." in that path
*/
- snprintf(tempname, ARRAY_SIZE(tempname), "%s", file_name);
+ snprintf(tempname, FIO_ARRAY_SIZE(tempname), "%s", file_name);
p = dirname(tempname);
if (stat(p, &st)) {
perror("disk util stat");
@@ -313,7 +309,7 @@ static struct disk_util *disk_util_add(struct thread_data *td, int majdev,
sfree(du);
return NULL;
}
- snprintf((char *) du->dus.name, ARRAY_SIZE(du->dus.name), "%s",
+ snprintf((char *) du->dus.name, FIO_ARRAY_SIZE(du->dus.name), "%s",
basename(path));
du->sysfs_root = strdup(path);
du->major = majdev;
@@ -435,7 +431,7 @@ static struct disk_util *__init_per_file_disk_util(struct thread_data *td,
log_err("unknown sysfs layout\n");
return NULL;
}
- snprintf(tmp, ARRAY_SIZE(tmp), "%s", p);
+ snprintf(tmp, FIO_ARRAY_SIZE(tmp), "%s", p);
sprintf(path, "%s", tmp);
}
diff --git a/engines/cpu.c b/engines/cpu.c
index 4d572b44..ce74dbce 100644
--- a/engines/cpu.c
+++ b/engines/cpu.c
@@ -8,11 +8,26 @@
#include "../fio.h"
#include "../optgroup.h"
+// number of 32 bit integers to sort
+size_t qsort_size = (256 * (1ULL << 10)); // 256KB
+
+struct mwc {
+ uint32_t w;
+ uint32_t z;
+};
+
+enum stress_mode {
+ FIO_CPU_NOOP = 0,
+ FIO_CPU_QSORT = 1,
+};
+
struct cpu_options {
void *pad;
unsigned int cpuload;
unsigned int cpucycle;
+ enum stress_mode cpumode;
unsigned int exit_io_done;
+ int32_t *qsort_data;
};
static struct fio_option options[] = {
@@ -26,6 +41,26 @@ static struct fio_option options[] = {
.group = FIO_OPT_G_INVALID,
},
{
+ .name = "cpumode",
+ .lname = "cpumode",
+ .type = FIO_OPT_STR,
+ .help = "Stress mode",
+ .off1 = offsetof(struct cpu_options, cpumode),
+ .def = "noop",
+ .posval = {
+ { .ival = "noop",
+ .oval = FIO_CPU_NOOP,
+ .help = "NOOP instructions",
+ },
+ { .ival = "qsort",
+ .oval = FIO_CPU_QSORT,
+ .help = "QSORT computation",
+ },
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
.name = "cpuchunks",
.lname = "CPU chunk",
.type = FIO_OPT_INT,
@@ -52,6 +87,91 @@ static struct fio_option options[] = {
},
};
+/*
+ * mwc32()
+ * Multiply-with-carry random numbers
+ * fast pseudo random number generator, see
+ * http://www.cse.yorku.ca/~oz/marsaglia-rng.html
+ */
+uint32_t mwc32(struct mwc *mwc)
+{
+ mwc->z = 36969 * (mwc->z & 65535) + (mwc->z >> 16);
+ mwc->w = 18000 * (mwc->w & 65535) + (mwc->w >> 16);
+ return (mwc->z << 16) + mwc->w;
+}
+
+/*
+ * stress_qsort_cmp_1()
+ * qsort comparison - sort on int32 values
+ */
+static int stress_qsort_cmp_1(const void *p1, const void *p2)
+{
+ const int32_t *i1 = (const int32_t *)p1;
+ const int32_t *i2 = (const int32_t *)p2;
+
+ if (*i1 > *i2)
+ return 1;
+ else if (*i1 < *i2)
+ return -1;
+ else
+ return 0;
+}
+
+/*
+ * stress_qsort_cmp_2()
+ * qsort comparison - reverse sort on int32 values
+ */
+static int stress_qsort_cmp_2(const void *p1, const void *p2)
+{
+ return stress_qsort_cmp_1(p2, p1);
+}
+
+/*
+ * stress_qsort_cmp_3()
+ * qsort comparison - sort on int8 values
+ */
+static int stress_qsort_cmp_3(const void *p1, const void *p2)
+{
+ const int8_t *i1 = (const int8_t *)p1;
+ const int8_t *i2 = (const int8_t *)p2;
+
+ /* Force re-ordering on 8 bit value */
+ return *i1 - *i2;
+}
+
+static int do_qsort(struct thread_data *td)
+{
+ struct thread_options *o = &td->o;
+ struct cpu_options *co = td->eo;
+ struct timespec start, now;
+
+ fio_get_mono_time(&start);
+
+ /* Sort "random" data */
+ qsort(co->qsort_data, qsort_size, sizeof(*(co->qsort_data)), stress_qsort_cmp_1);
+
+ /* Reverse sort */
+ qsort(co->qsort_data, qsort_size, sizeof(*(co->qsort_data)), stress_qsort_cmp_2);
+
+ /* And re-order by byte compare */
+ qsort((uint8_t *)co->qsort_data, qsort_size * 4, sizeof(uint8_t), stress_qsort_cmp_3);
+
+ /* Reverse sort this again */
+ qsort(co->qsort_data, qsort_size, sizeof(*(co->qsort_data)), stress_qsort_cmp_2);
+ fio_get_mono_time(&now);
+
+ /* Adjusting cpucycle automatically to be as close as possible to the
+ * expected cpuload The time to execute do_qsort() may change over time
+ * as per : - the job concurrency - the cpu clock adjusted by the power
+ * management After every do_qsort() call, the next thinktime is
+ * adjusted regarding the last run performance
+ */
+ co->cpucycle = utime_since(&start, &now);
+ o->thinktime = ((unsigned long long) co->cpucycle *
+ (100 - co->cpuload)) / co->cpuload;
+
+ return 0;
+}
static enum fio_q_status fio_cpuio_queue(struct thread_data *td,
struct io_u fio_unused *io_u)
@@ -63,14 +183,69 @@ static enum fio_q_status fio_cpuio_queue(struct thread_data *td,
return FIO_Q_BUSY;
}
- usec_spin(co->cpucycle);
+ switch (co->cpumode) {
+ case FIO_CPU_NOOP:
+ usec_spin(co->cpucycle);
+ break;
+ case FIO_CPU_QSORT:
+ do_qsort(td);
+ break;
+ }
+
return FIO_Q_COMPLETED;
}
+static int noop_init(struct thread_data *td)
+{
+ struct cpu_options *co = td->eo;
+
+ log_info("%s (noop): ioengine=%s, cpuload=%u, cpucycle=%u\n",
+ td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
+ return 0;
+}
+
+static int qsort_cleanup(struct thread_data *td)
+{
+ struct cpu_options *co = td->eo;
+
+ if (co->qsort_data) {
+ free(co->qsort_data);
+ co->qsort_data = NULL;
+ }
+
+ return 0;
+}
+
+static int qsort_init(struct thread_data *td)
+{
+ /* Setting up a default entropy */
+ struct mwc mwc = { 521288629UL, 362436069UL };
+ struct cpu_options *co = td->eo;
+ int32_t *ptr;
+ int i;
+
+ co->qsort_data = calloc(qsort_size, sizeof(*co->qsort_data));
+ if (co->qsort_data == NULL) {
+ td_verror(td, ENOMEM, "qsort_init");
+ return 1;
+ }
+
+ /* This is expensive, init the memory once */
+ for (ptr = co->qsort_data, i = 0; i < qsort_size; i++)
+ *ptr++ = mwc32(&mwc);
+
+ log_info("%s (qsort): ioengine=%s, cpuload=%u, cpucycle=%u\n",
+ td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
+
+ return 0;
+}
+
static int fio_cpuio_init(struct thread_data *td)
{
struct thread_options *o = &td->o;
struct cpu_options *co = td->eo;
+ int td_previous_state;
+ char *msg;
if (!co->cpuload) {
td_vmsg(td, EINVAL, "cpu thread needs rate (cpuload=)","cpuio");
@@ -80,21 +255,59 @@ static int fio_cpuio_init(struct thread_data *td)
if (co->cpuload > 100)
co->cpuload = 100;
+ /* Saving the current thread state */
+ td_previous_state = td->runstate;
+
+ /* Reporting that we are preparing the engine
+ * This is useful as the qsort() calibration takes time
+ * This prevents the job from starting before init is completed
+ */
+ td_set_runstate(td, TD_SETTING_UP);
+
/*
* set thinktime_sleep and thinktime_spin appropriately
*/
o->thinktime_blocks = 1;
+ o->thinktime_blocks_type = THINKTIME_BLOCKS_TYPE_COMPLETE;
o->thinktime_spin = 0;
- o->thinktime = ((unsigned long long) co->cpucycle * (100 - co->cpuload)) / co->cpuload;
+ o->thinktime = ((unsigned long long) co->cpucycle *
+ (100 - co->cpuload)) / co->cpuload;
o->nr_files = o->open_files = 1;
- log_info("%s: ioengine=%s, cpuload=%u, cpucycle=%u\n",
- td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
+ switch (co->cpumode) {
+ case FIO_CPU_NOOP:
+ noop_init(td);
+ break;
+ case FIO_CPU_QSORT:
+ qsort_init(td);
+ break;
+ default:
+ if (asprintf(&msg, "bad cpu engine mode: %d", co->cpumode) < 0)
+ msg = NULL;
+ td_vmsg(td, EINVAL, msg ? : "(?)", __func__);
+ free(msg);
+ return 1;
+ }
+ /* Let's restore the previous state. */
+ td_set_runstate(td, td_previous_state);
return 0;
}
+static void fio_cpuio_cleanup(struct thread_data *td)
+{
+ struct cpu_options *co = td->eo;
+
+ switch (co->cpumode) {
+ case FIO_CPU_NOOP:
+ break;
+ case FIO_CPU_QSORT:
+ qsort_cleanup(td);
+ break;
+ }
+}
+
static int fio_cpuio_open(struct thread_data fio_unused *td,
struct fio_file fio_unused *f)
{
@@ -102,12 +315,13 @@ static int fio_cpuio_open(struct thread_data fio_unused *td,
}
static struct ioengine_ops ioengine = {
- .name = "cpuio",
- .version = FIO_IOOPS_VERSION,
- .queue = fio_cpuio_queue,
- .init = fio_cpuio_init,
- .open_file = fio_cpuio_open,
- .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
+ .name = "cpuio",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_cpuio_queue,
+ .init = fio_cpuio_init,
+ .cleanup = fio_cpuio_cleanup,
+ .open_file = fio_cpuio_open,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
.options = options,
.option_struct_size = sizeof(struct cpu_options),
};
diff --git a/engines/dfs.c b/engines/dfs.c
new file mode 100644
index 00000000..0343b101
--- /dev/null
+++ b/engines/dfs.c
@@ -0,0 +1,583 @@
+/**
+ * FIO engine for DAOS File System (dfs).
+ *
+ * (C) Copyright 2020-2021 Intel Corporation.
+ */
+
+#include <fio.h>
+#include <optgroup.h>
+
+#include <daos.h>
+#include <daos_fs.h>
+
+static bool daos_initialized;
+static int num_threads;
+static pthread_mutex_t daos_mutex = PTHREAD_MUTEX_INITIALIZER;
+daos_handle_t poh; /* pool handle */
+daos_handle_t coh; /* container handle */
+daos_oclass_id_t cid = OC_UNKNOWN; /* object class */
+dfs_t *dfs; /* dfs mount reference */
+
+struct daos_iou {
+ struct io_u *io_u;
+ daos_event_t ev;
+ d_sg_list_t sgl;
+ d_iov_t iov;
+ daos_size_t size;
+ bool complete;
+};
+
+struct daos_data {
+ daos_handle_t eqh;
+ dfs_obj_t *obj;
+ struct io_u **io_us;
+ int queued;
+ int num_ios;
+};
+
+struct daos_fio_options {
+ void *pad;
+ char *pool; /* Pool UUID */
+ char *cont; /* Container UUID */
+ daos_size_t chsz; /* Chunk size */
+ char *oclass; /* object class */
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+ char *svcl; /* service replica list, deprecated */
+#endif
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "pool",
+ .lname = "pool uuid",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, pool),
+ .help = "DAOS pool uuid",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "cont",
+ .lname = "container uuid",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, cont),
+ .help = "DAOS container uuid",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "chunk_size",
+ .lname = "DFS chunk size",
+ .type = FIO_OPT_ULL,
+ .off1 = offsetof(struct daos_fio_options, chsz),
+ .help = "DFS chunk size in bytes",
+ .def = "0", /* use container default */
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "object_class",
+ .lname = "object class",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, oclass),
+ .help = "DAOS object class",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+ {
+ .name = "svcl",
+ .lname = "List of service ranks",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, svcl),
+ .help = "List of pool replicated service ranks",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+#endif
+ {
+ .name = NULL,
+ },
+};
+
+static int daos_fio_global_init(struct thread_data *td)
+{
+ struct daos_fio_options *eo = td->eo;
+ uuid_t pool_uuid, co_uuid;
+ daos_pool_info_t pool_info;
+ daos_cont_info_t co_info;
+ int rc = 0;
+
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+ if (!eo->pool || !eo->cont || !eo->svcl) {
+#else
+ if (!eo->pool || !eo->cont) {
+#endif
+ log_err("Missing required DAOS options\n");
+ return EINVAL;
+ }
+
+ rc = daos_init();
+ if (rc != -DER_ALREADY && rc) {
+ log_err("Failed to initialize daos %d\n", rc);
+ td_verror(td, rc, "daos_init");
+ return rc;
+ }
+
+ rc = uuid_parse(eo->pool, pool_uuid);
+ if (rc) {
+ log_err("Failed to parse 'Pool uuid': %s\n", eo->pool);
+ td_verror(td, EINVAL, "uuid_parse(eo->pool)");
+ return EINVAL;
+ }
+
+ rc = uuid_parse(eo->cont, co_uuid);
+ if (rc) {
+ log_err("Failed to parse 'Cont uuid': %s\n", eo->cont);
+ td_verror(td, EINVAL, "uuid_parse(eo->cont)");
+ return EINVAL;
+ }
+
+ /* Connect to the DAOS pool */
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+ d_rank_list_t *svcl = NULL;
+
+ svcl = daos_rank_list_parse(eo->svcl, ":");
+ if (svcl == NULL) {
+ log_err("Failed to parse svcl\n");
+ td_verror(td, EINVAL, "daos_rank_list_parse");
+ return EINVAL;
+ }
+
+ rc = daos_pool_connect(pool_uuid, NULL, svcl, DAOS_PC_RW,
+ &poh, &pool_info, NULL);
+ d_rank_list_free(svcl);
+#else
+ rc = daos_pool_connect(pool_uuid, NULL, DAOS_PC_RW, &poh, &pool_info,
+ NULL);
+#endif
+ if (rc) {
+ log_err("Failed to connect to pool %d\n", rc);
+ td_verror(td, rc, "daos_pool_connect");
+ return rc;
+ }
+
+ /* Open the DAOS container */
+ rc = daos_cont_open(poh, co_uuid, DAOS_COO_RW, &coh, &co_info, NULL);
+ if (rc) {
+ log_err("Failed to open container: %d\n", rc);
+ td_verror(td, rc, "daos_cont_open");
+ (void)daos_pool_disconnect(poh, NULL);
+ return rc;
+ }
+
+ /* Mount encapsulated filesystem */
+ rc = dfs_mount(poh, coh, O_RDWR, &dfs);
+ if (rc) {
+ log_err("Failed to mount DFS namespace: %d\n", rc);
+ td_verror(td, rc, "dfs_mount");
+ (void)daos_pool_disconnect(poh, NULL);
+ (void)daos_cont_close(coh, NULL);
+ return rc;
+ }
+
+ /* Retrieve object class to use, if specified */
+ if (eo->oclass)
+ cid = daos_oclass_name2id(eo->oclass);
+
+ return 0;
+}
+
+static int daos_fio_global_cleanup()
+{
+ int rc;
+ int ret = 0;
+
+ rc = dfs_umount(dfs);
+ if (rc) {
+ log_err("failed to umount dfs: %d\n", rc);
+ ret = rc;
+ }
+ rc = daos_cont_close(coh, NULL);
+ if (rc) {
+ log_err("failed to close container: %d\n", rc);
+ if (ret == 0)
+ ret = rc;
+ }
+ rc = daos_pool_disconnect(poh, NULL);
+ if (rc) {
+ log_err("failed to disconnect pool: %d\n", rc);
+ if (ret == 0)
+ ret = rc;
+ }
+ rc = daos_fini();
+ if (rc) {
+ log_err("failed to finalize daos: %d\n", rc);
+ if (ret == 0)
+ ret = rc;
+ }
+
+ return ret;
+}
+
+static int daos_fio_setup(struct thread_data *td)
+{
+ return 0;
+}
+
+static int daos_fio_init(struct thread_data *td)
+{
+ struct daos_data *dd;
+ int rc = 0;
+
+ pthread_mutex_lock(&daos_mutex);
+
+ dd = malloc(sizeof(*dd));
+ if (dd == NULL) {
+ log_err("Failed to allocate DAOS-private data\n");
+ rc = ENOMEM;
+ goto out;
+ }
+
+ dd->queued = 0;
+ dd->num_ios = td->o.iodepth;
+ dd->io_us = calloc(dd->num_ios, sizeof(struct io_u *));
+ if (dd->io_us == NULL) {
+ log_err("Failed to allocate IO queue\n");
+ rc = ENOMEM;
+ goto out;
+ }
+
+ /* initialize DAOS stack if not already up */
+ if (!daos_initialized) {
+ rc = daos_fio_global_init(td);
+ if (rc)
+ goto out;
+ daos_initialized = true;
+ }
+
+ rc = daos_eq_create(&dd->eqh);
+ if (rc) {
+ log_err("Failed to create event queue: %d\n", rc);
+ td_verror(td, rc, "daos_eq_create");
+ goto out;
+ }
+
+ td->io_ops_data = dd;
+ num_threads++;
+out:
+ if (rc) {
+ if (dd) {
+ free(dd->io_us);
+ free(dd);
+ }
+ if (num_threads == 0 && daos_initialized) {
+ /* don't clobber error return value */
+ (void)daos_fio_global_cleanup();
+ daos_initialized = false;
+ }
+ }
+ pthread_mutex_unlock(&daos_mutex);
+ return rc;
+}
+
+static void daos_fio_cleanup(struct thread_data *td)
+{
+ struct daos_data *dd = td->io_ops_data;
+ int rc;
+
+ if (dd == NULL)
+ return;
+
+ rc = daos_eq_destroy(dd->eqh, DAOS_EQ_DESTROY_FORCE);
+ if (rc < 0) {
+ log_err("failed to destroy event queue: %d\n", rc);
+ td_verror(td, rc, "daos_eq_destroy");
+ }
+
+ free(dd->io_us);
+ free(dd);
+
+ pthread_mutex_lock(&daos_mutex);
+ num_threads--;
+ if (daos_initialized && num_threads == 0) {
+ int ret;
+
+ ret = daos_fio_global_cleanup();
+ if (ret < 0 && rc == 0) {
+ log_err("failed to clean up: %d\n", ret);
+ td_verror(td, ret, "daos_fio_global_cleanup");
+ }
+ daos_initialized = false;
+ }
+ pthread_mutex_unlock(&daos_mutex);
+}
+
+static int daos_fio_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+ char *file_name = f->file_name;
+ struct stat stbuf = {0};
+ int rc;
+
+ dprint(FD_FILE, "dfs stat %s\n", f->file_name);
+
+ if (!daos_initialized)
+ return 0;
+
+ rc = dfs_stat(dfs, NULL, file_name, &stbuf);
+ if (rc) {
+ log_err("Failed to stat %s: %d\n", f->file_name, rc);
+ td_verror(td, rc, "dfs_stat");
+ return rc;
+ }
+
+ f->real_file_size = stbuf.st_size;
+ return 0;
+}
+
+static int daos_fio_close(struct thread_data *td, struct fio_file *f)
+{
+ struct daos_data *dd = td->io_ops_data;
+ int rc;
+
+ dprint(FD_FILE, "dfs release %s\n", f->file_name);
+
+ rc = dfs_release(dd->obj);
+ if (rc) {
+ log_err("Failed to release %s: %d\n", f->file_name, rc);
+ td_verror(td, rc, "dfs_release");
+ return rc;
+ }
+
+ return 0;
+}
+
+static int daos_fio_open(struct thread_data *td, struct fio_file *f)
+{
+ struct daos_data *dd = td->io_ops_data;
+ struct daos_fio_options *eo = td->eo;
+ int flags = 0;
+ int rc;
+
+ dprint(FD_FILE, "dfs open %s (%s/%d/%d)\n",
+ f->file_name, td_write(td) & !read_only ? "rw" : "r",
+ td->o.create_on_open, td->o.allow_create);
+
+ if (td->o.create_on_open && td->o.allow_create)
+ flags |= O_CREAT;
+
+ if (td_write(td)) {
+ if (!read_only)
+ flags |= O_RDWR;
+ if (td->o.allow_create)
+ flags |= O_CREAT;
+ } else if (td_read(td)) {
+ flags |= O_RDONLY;
+ }
+
+ rc = dfs_open(dfs, NULL, f->file_name,
+ S_IFREG | S_IRUSR | S_IWUSR,
+ flags, cid, eo->chsz, NULL, &dd->obj);
+ if (rc) {
+ log_err("Failed to open %s: %d\n", f->file_name, rc);
+ td_verror(td, rc, "dfs_open");
+ return rc;
+ }
+
+ return 0;
+}
+
+static int daos_fio_unlink(struct thread_data *td, struct fio_file *f)
+{
+ int rc;
+
+ dprint(FD_FILE, "dfs remove %s\n", f->file_name);
+
+ rc = dfs_remove(dfs, NULL, f->file_name, false, NULL);
+ if (rc) {
+ log_err("Failed to remove %s: %d\n", f->file_name, rc);
+ td_verror(td, rc, "dfs_remove");
+ return rc;
+ }
+
+ return 0;
+}
+
+static int daos_fio_invalidate(struct thread_data *td, struct fio_file *f)
+{
+ dprint(FD_FILE, "dfs invalidate %s\n", f->file_name);
+ return 0;
+}
+
+static void daos_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+ struct daos_iou *io = io_u->engine_data;
+
+ if (io) {
+ io_u->engine_data = NULL;
+ free(io);
+ }
+}
+
+static int daos_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+ struct daos_iou *io;
+
+ io = malloc(sizeof(struct daos_iou));
+ if (!io) {
+ td_verror(td, ENOMEM, "malloc");
+ return ENOMEM;
+ }
+ io->io_u = io_u;
+ io_u->engine_data = io;
+ return 0;
+}
+
+static struct io_u * daos_fio_event(struct thread_data *td, int event)
+{
+ struct daos_data *dd = td->io_ops_data;
+
+ return dd->io_us[event];
+}
+
+static int daos_fio_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t)
+{
+ struct daos_data *dd = td->io_ops_data;
+ daos_event_t *evp[max];
+ unsigned int events = 0;
+ int i;
+ int rc;
+
+ while (events < min) {
+ rc = daos_eq_poll(dd->eqh, 0, DAOS_EQ_NOWAIT, max, evp);
+ if (rc < 0) {
+ log_err("Event poll failed: %d\n", rc);
+ td_verror(td, rc, "daos_eq_poll");
+ return events;
+ }
+
+ for (i = 0; i < rc; i++) {
+ struct daos_iou *io;
+ struct io_u *io_u;
+
+ io = container_of(evp[i], struct daos_iou, ev);
+ if (io->complete)
+ log_err("Completion on already completed I/O\n");
+
+ io_u = io->io_u;
+ if (io->ev.ev_error)
+ io_u->error = io->ev.ev_error;
+ else
+ io_u->resid = 0;
+
+ dd->io_us[events] = io_u;
+ dd->queued--;
+ daos_event_fini(&io->ev);
+ io->complete = true;
+ events++;
+ }
+ }
+
+ dprint(FD_IO, "dfs eq_pool returning %d (%u/%u)\n", events, min, max);
+
+ return events;
+}
+
+static enum fio_q_status daos_fio_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct daos_data *dd = td->io_ops_data;
+ struct daos_iou *io = io_u->engine_data;
+ daos_off_t offset = io_u->offset;
+ int rc;
+
+ if (dd->queued == td->o.iodepth)
+ return FIO_Q_BUSY;
+
+ io->sgl.sg_nr = 1;
+ io->sgl.sg_nr_out = 0;
+ d_iov_set(&io->iov, io_u->xfer_buf, io_u->xfer_buflen);
+ io->sgl.sg_iovs = &io->iov;
+ io->size = io_u->xfer_buflen;
+
+ io->complete = false;
+ rc = daos_event_init(&io->ev, dd->eqh, NULL);
+ if (rc) {
+ log_err("Event init failed: %d\n", rc);
+ io_u->error = rc;
+ return FIO_Q_COMPLETED;
+ }
+
+ switch (io_u->ddir) {
+ case DDIR_WRITE:
+ rc = dfs_write(dfs, dd->obj, &io->sgl, offset, &io->ev);
+ if (rc) {
+ log_err("dfs_write failed: %d\n", rc);
+ io_u->error = rc;
+ return FIO_Q_COMPLETED;
+ }
+ break;
+ case DDIR_READ:
+ rc = dfs_read(dfs, dd->obj, &io->sgl, offset, &io->size,
+ &io->ev);
+ if (rc) {
+ log_err("dfs_read failed: %d\n", rc);
+ io_u->error = rc;
+ return FIO_Q_COMPLETED;
+ }
+ break;
+ case DDIR_SYNC:
+ io_u->error = 0;
+ return FIO_Q_COMPLETED;
+ default:
+ dprint(FD_IO, "Invalid IO type: %d\n", io_u->ddir);
+ io_u->error = -DER_INVAL;
+ return FIO_Q_COMPLETED;
+ }
+
+ dd->queued++;
+ return FIO_Q_QUEUED;
+}
+
+static int daos_fio_prep(struct thread_data fio_unused *td, struct io_u *io_u)
+{
+ return 0;
+}
+
+/* ioengine_ops for get_ioengine() */
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "dfs",
+ .version = FIO_IOOPS_VERSION,
+ .flags = FIO_DISKLESSIO | FIO_NODISKUTIL,
+
+ .setup = daos_fio_setup,
+ .init = daos_fio_init,
+ .prep = daos_fio_prep,
+ .cleanup = daos_fio_cleanup,
+
+ .open_file = daos_fio_open,
+ .invalidate = daos_fio_invalidate,
+ .get_file_size = daos_fio_get_file_size,
+ .close_file = daos_fio_close,
+ .unlink_file = daos_fio_unlink,
+
+ .queue = daos_fio_queue,
+ .getevents = daos_fio_getevents,
+ .event = daos_fio_event,
+ .io_u_init = daos_fio_io_u_init,
+ .io_u_free = daos_fio_io_u_free,
+
+ .option_struct_size = sizeof(struct daos_fio_options),
+ .options = options,
+};
+
+static void fio_init fio_dfs_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_dfs_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/exec.c b/engines/exec.c
new file mode 100644
index 00000000..ab3639c5
--- /dev/null
+++ b/engines/exec.c
@@ -0,0 +1,394 @@
+/*
+ * Exec engine
+ *
+ * Doesn't transfer any data, merely runs 3rd party tools
+ *
+ */
+#include "../fio.h"
+#include "../optgroup.h"
+#include <signal.h>
+
+struct exec_options {
+ void *pad;
+ char *program;
+ char *arguments;
+ int grace_time;
+ unsigned int std_redirect;
+ pid_t pid;
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "program",
+ .lname = "Program",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct exec_options, program),
+ .help = "Program to execute",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "arguments",
+ .lname = "Arguments",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct exec_options, arguments),
+ .help = "Arguments to pass",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "grace_time",
+ .lname = "Grace time",
+ .type = FIO_OPT_INT,
+ .minval = 0,
+ .def = "1",
+ .off1 = offsetof(struct exec_options, grace_time),
+ .help = "Grace time before sending a SIGKILL",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "std_redirect",
+ .lname = "Std redirect",
+ .type = FIO_OPT_BOOL,
+ .def = "1",
+ .off1 = offsetof(struct exec_options, std_redirect),
+ .help = "Redirect stdout & stderr to files",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+char *str_replace(char *orig, const char *rep, const char *with)
+{
+ /*
+ * Replace every occurrence of a substring with another string.
+ *
+ * Returns a newly allocated string if occurrences were found.
+ * Returns orig if no occurrence is found.
+ */
+ char *result, *insert, *tmp;
+ int len_rep, len_with, len_front, count;
+
+ /* sanity checks and initialization */
+ if (!orig || !rep)
+ return orig;
+
+ len_rep = strlen(rep);
+ if (len_rep == 0)
+ return orig;
+
+ if (!with)
+ with = "";
+ len_with = strlen(with);
+
+ insert = orig;
+ for (count = 0; (tmp = strstr(insert, rep)); ++count) {
+ insert = tmp + len_rep;
+ }
+
+ tmp = result = malloc(strlen(orig) + (len_with - len_rep) * count + 1);
+
+ if (!result)
+ return orig;
+
+ while (count--) {
+ insert = strstr(orig, rep);
+ len_front = insert - orig;
+ tmp = strncpy(tmp, orig, len_front) + len_front;
+ tmp = strcpy(tmp, with) + len_with;
+ orig += len_front + len_rep;
+ }
+ strcpy(tmp, orig);
+ return result;
+}
+
+char *expand_variables(struct thread_options *o, char *arguments)
+{
+ char str[16];
+ char *expanded_runtime, *expanded_name;
+ snprintf(str, sizeof(str), "%lld", o->timeout / 1000000);
+
+ /* %r is replaced by the runtime in seconds */
+ expanded_runtime = str_replace(arguments, "%r", str);
+
+ /* %n is replaced by the name of the running job */
+ expanded_name = str_replace(expanded_runtime, "%n", o->name);
+
+ free(expanded_runtime);
+ return expanded_name;
+}
+
+static int exec_background(struct thread_options *o, struct exec_options *eo)
+{
+ char *outfilename = NULL, *errfilename = NULL;
+ int outfd = 0, errfd = 0;
+ pid_t pid;
+ char *expanded_arguments = NULL;
+ /* For the arguments splitting */
+ char **arguments_array = NULL;
+ char *p;
+ char *exec_cmd = NULL;
+ size_t arguments_nb_items = 0, q;
+
+ if (asprintf(&outfilename, "%s.stdout", o->name) < 0)
+ return -1;
+
+ if (asprintf(&errfilename, "%s.stderr", o->name) < 0) {
+ free(outfilename);
+ return -1;
+ }
+
+ /* If we have variables in the arguments, let's expand them */
+ expanded_arguments = expand_variables(o, eo->arguments);
+
+ if (eo->std_redirect) {
+ log_info("%s : Saving output of %s %s : stdout=%s stderr=%s\n",
+ o->name, eo->program, expanded_arguments, outfilename,
+ errfilename);
+
+ /* Creating the stderr & stdout output files */
+ outfd = open(outfilename, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+ if (outfd < 0) {
+ log_err("fio: cannot open output file %s : %s\n",
+ outfilename, strerror(errno));
+ free(outfilename);
+ free(errfilename);
+ free(expanded_arguments);
+ return -1;
+ }
+
+ errfd = open(errfilename, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+ if (errfd < 0) {
+ log_err("fio: cannot open output file %s : %s\n",
+ errfilename, strerror(errno));
+ free(outfilename);
+ free(errfilename);
+ free(expanded_arguments);
+ close(outfd);
+ return -1;
+ }
+ } else {
+ log_info("%s : Running %s %s\n",
+ o->name, eo->program, expanded_arguments);
+ }
+
+ pid = fork();
+
+ /* We are on the control thread (parent side of the fork) */
+ if (pid > 0) {
+ eo->pid = pid;
+ if (eo->std_redirect) {
+ /* The output file is for the client side of the fork */
+ close(outfd);
+ close(errfd);
+ free(outfilename);
+ free(errfilename);
+ }
+ free(expanded_arguments);
+ return 0;
+ }
+
+ /* If the fork failed */
+ if (pid < 0) {
+ log_err("fio: forking failed %s \n", strerror(errno));
+ if (eo->std_redirect) {
+ close(outfd);
+ close(errfd);
+ free(outfilename);
+ free(errfilename);
+ }
+ free(expanded_arguments);
+ return -1;
+ }
+
+ /* We are in the worker (child side of the fork) */
+ if (pid == 0) {
+ if (eo->std_redirect) {
+ /* replace stdout by the output file we create */
+ dup2(outfd, 1);
+ /* replace stderr by the output file we create */
+ dup2(errfd, 2);
+ close(outfd);
+ close(errfd);
+ free(outfilename);
+ free(errfilename);
+ }
+
+ /*
+ * Let's split the command line into a null terminated array to
+ * be passed to the exec'd program.
+ * But don't asprintf expanded_arguments if NULL as it would be
+ * converted to a '(null)' argument, while we want no arguments
+ * at all.
+ */
+ if (expanded_arguments != NULL) {
+ if (asprintf(&exec_cmd, "%s %s", eo->program, expanded_arguments) < 0) {
+ free(expanded_arguments);
+ return -1;
+ }
+ } else {
+ if (asprintf(&exec_cmd, "%s", eo->program) < 0)
+ return -1;
+ }
+
+ /*
+ * Let's build an argv array based on the program name and
+ * arguments
+ */
+ p = exec_cmd;
+ for (;;) {
+ p += strspn(p, " ");
+
+ if (!(q = strcspn(p, " ")))
+ break;
+
+ if (q) {
+ arguments_array =
+ realloc(arguments_array,
+ (arguments_nb_items +
+ 1) * sizeof(char *));
+ arguments_array[arguments_nb_items] =
+ malloc(q + 1);
+ strncpy(arguments_array[arguments_nb_items], p,
+ q);
+ arguments_array[arguments_nb_items][q] = 0;
+ arguments_nb_items++;
+ p += q;
+ }
+ }
+
+ /* Adding a null-terminated item to close the list */
+ arguments_array =
+ realloc(arguments_array,
+ (arguments_nb_items + 1) * sizeof(char *));
+ arguments_array[arguments_nb_items] = NULL;
+
+ /*
+ * Replace the fio program from the child fork by the target
+ * program
+ */
+ execvp(arguments_array[0], arguments_array);
+ }
+ /* Only reached if execvp() fails in the child */
+ /* Let's free the malloc'ed structures to make static checkers happy */
+ if (expanded_arguments)
+ free(expanded_arguments);
+ if (arguments_array)
+ free(arguments_array);
+ return 0;
+}
+
+static enum fio_q_status
+fio_exec_queue(struct thread_data *td, struct io_u fio_unused * io_u)
+{
+ struct thread_options *o = &td->o;
+ struct exec_options *eo = td->eo;
+
+ /* Let's execute the program the first time we get queued */
+ if (eo->pid == -1) {
+ exec_background(o, eo);
+ } else {
+ /*
+ * The program is running in background, let's check on a
+ * regular basis
+ * if the time is over and if we need to stop the tool
+ */
+ usleep(o->thinktime);
+ if (utime_since_now(&td->start) > o->timeout) {
+ /* Let's stop the child */
+ kill(eo->pid, SIGTERM);
+ /*
+ * Let's give grace_time (1 sec by default) to the 3rd
+ * party tool to stop
+ */
+ sleep(eo->grace_time);
+ }
+ }
+
+ return FIO_Q_COMPLETED;
+}
+
+static int fio_exec_init(struct thread_data *td)
+{
+ struct thread_options *o = &td->o;
+ struct exec_options *eo = td->eo;
+ int td_previous_state;
+
+ eo->pid = -1;
+
+ if (!eo->program) {
+ td_vmsg(td, EINVAL,
+ "no program is defined, it is mandatory to define one",
+ "exec");
+ return 1;
+ }
+
+ log_info("%s : program=%s, arguments=%s\n",
+ td->o.name, eo->program, eo->arguments);
+
+ /* Saving the current thread state */
+ td_previous_state = td->runstate;
+
+ /*
+ * Reporting that we are preparing the engine
+ * This is useful as the engine setup may take time
+ * This prevents the job from starting before init is completed
+ */
+ td_set_runstate(td, TD_SETTING_UP);
+
+ /*
+ * set thinktime_sleep and thinktime_spin appropriately
+ */
+ o->thinktime_blocks = 1;
+ o->thinktime_blocks_type = THINKTIME_BLOCKS_TYPE_COMPLETE;
+ o->thinktime_spin = 0;
+ /* 50ms pause when waiting for the program to complete */
+ o->thinktime = 50000;
+
+ o->nr_files = o->open_files = 1;
+
+ /* Let's restore the previous state. */
+ td_set_runstate(td, td_previous_state);
+ return 0;
+}
+
+static void fio_exec_cleanup(struct thread_data *td)
+{
+ struct exec_options *eo = td->eo;
+ /* Send a sigkill to ensure the job is well terminated */
+ if (eo->pid > 0)
+ kill(eo->pid, SIGKILL);
+}
+
+static int
+fio_exec_open(struct thread_data fio_unused * td,
+ struct fio_file fio_unused * f)
+{
+ return 0;
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "exec",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_exec_queue,
+ .init = fio_exec_init,
+ .cleanup = fio_exec_cleanup,
+ .open_file = fio_exec_open,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
+ .options = options,
+ .option_struct_size = sizeof(struct exec_options),
+};
+
+static void fio_init fio_exec_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_exec_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/falloc.c b/engines/falloc.c
index 6382569b..4b05ed68 100644
--- a/engines/falloc.c
+++ b/engines/falloc.c
@@ -25,8 +25,8 @@ static int open_file(struct thread_data *td, struct fio_file *f)
dprint(FD_FILE, "fd open %s\n", f->file_name);
- if (f->filetype != FIO_TYPE_FILE) {
- log_err("fio: only files are supported fallocate \n");
+ if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK) {
+ log_err("fio: only files and blockdev are supported fallocate \n");
return 1;
}
if (!strcmp(f->file_name, "-")) {
diff --git a/engines/filecreate.c b/engines/filecreate.c
index 5fec8544..16c64928 100644
--- a/engines/filecreate.c
+++ b/engines/filecreate.c
@@ -22,7 +22,7 @@ static int open_file(struct thread_data *td, struct fio_file *f)
dprint(FD_FILE, "fd open %s\n", f->file_name);
if (f->filetype != FIO_TYPE_FILE) {
- log_err("fio: only files are supported fallocate \n");
+ log_err("fio: only files are supported\n");
return 1;
}
if (!strcmp(f->file_name, "-")) {
diff --git a/engines/filedelete.c b/engines/filedelete.c
new file mode 100644
index 00000000..64c58639
--- /dev/null
+++ b/engines/filedelete.c
@@ -0,0 +1,115 @@
+/*
+ * file delete engine
+ *
+ * IO engine that doesn't do any IO, just delete files and track the latency
+ * of the file deletion.
+ */
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "../fio.h"
+
+struct fc_data {
+ enum fio_ddir stat_ddir;
+};
+
+static int delete_file(struct thread_data *td, struct fio_file *f)
+{
+ struct timespec start;
+ int do_lat = !td->o.disable_lat;
+ int ret;
+
+ dprint(FD_FILE, "fd delete %s\n", f->file_name);
+
+ if (f->filetype != FIO_TYPE_FILE) {
+ log_err("fio: only files are supported\n");
+ return 1;
+ }
+ if (!strcmp(f->file_name, "-")) {
+ log_err("fio: can't read/write to stdin/out\n");
+ return 1;
+ }
+
+ if (do_lat)
+ fio_gettime(&start, NULL);
+
+ ret = unlink(f->file_name);
+
+ if (ret == -1) {
+ char buf[FIO_VERROR_SIZE];
+ int e = errno;
+
+ snprintf(buf, sizeof(buf), "delete(%s)", f->file_name);
+ td_verror(td, e, buf);
+ return 1;
+ }
+
+ if (do_lat) {
+ struct fc_data *data = td->io_ops_data;
+ uint64_t nsec;
+
+ nsec = ntime_since_now(&start);
+ add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+ }
+
+ return 0;
+}
+
+
+static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u)
+{
+ return FIO_Q_COMPLETED;
+}
+
+static int init(struct thread_data *td)
+{
+ struct fc_data *data;
+
+ data = calloc(1, sizeof(*data));
+
+ if (td_read(td))
+ data->stat_ddir = DDIR_READ;
+ else if (td_write(td))
+ data->stat_ddir = DDIR_WRITE;
+
+ td->io_ops_data = data;
+ return 0;
+}
+
+static int delete_invalidate(struct thread_data *td, struct fio_file *f)
+{
+ /* do nothing because file not opened */
+ return 0;
+}
+
+static void cleanup(struct thread_data *td)
+{
+ struct fc_data *data = td->io_ops_data;
+
+ free(data);
+}
+
+static struct ioengine_ops ioengine = {
+ .name = "filedelete",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .invalidate = delete_invalidate,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .get_file_size = generic_get_file_size,
+ .open_file = delete_file,
+ .flags = FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+static void fio_init fio_filedelete_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_filedelete_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/glusterfs.c b/engines/glusterfs.c
index f2b84a2a..fc6fee19 100644
--- a/engines/glusterfs.c
+++ b/engines/glusterfs.c
@@ -271,8 +271,7 @@ int fio_gf_open_file(struct thread_data *td, struct fio_file *f)
if (td->o.odirect)
flags |= OS_O_DIRECT;
- if (td->o.sync_io)
- flags |= O_SYNC;
+ flags |= td->o.sync_io;
dprint(FD_FILE, "fio file %s open mode %s td rw %s\n", f->file_name,
flags & O_RDONLY ? "ro" : "rw", td_read(td) ? "read" : "write");
diff --git a/engines/ime.c b/engines/ime.c
index 42984021..440cc29e 100644
--- a/engines/ime.c
+++ b/engines/ime.c
@@ -194,8 +194,7 @@ static int fio_ime_open_file(struct thread_data *td, struct fio_file *f)
}
if (td->o.odirect)
flags |= O_DIRECT;
- if (td->o.sync_io)
- flags |= O_SYNC;
+ flags |= td->o.sync_io;
if (td->o.create_on_open && td->o.allow_create)
flags |= O_CREAT;
diff --git a/engines/io_uring.c b/engines/io_uring.c
index 69f48859..9c091e37 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -507,7 +507,7 @@ static void fio_ioring_unmap(struct ioring_data *ld)
{
int i;
- for (i = 0; i < ARRAY_SIZE(ld->mmap); i++)
+ for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++)
munmap(ld->mmap[i].ptr, ld->mmap[i].len);
close(ld->ring_fd);
}
@@ -696,7 +696,11 @@ static int fio_ioring_post_init(struct thread_data *td)
err = fio_ioring_queue_init(td);
if (err) {
- td_verror(td, errno, "io_queue_init");
+ int init_err = errno;
+
+ if (init_err == ENOSYS)
+ log_err("fio: your kernel doesn't support io_uring\n");
+ td_verror(td, init_err, "io_queue_init");
return 1;
}
@@ -724,12 +728,6 @@ static int fio_ioring_init(struct thread_data *td)
struct ioring_data *ld;
struct thread_options *to = &td->o;
- if (to->io_submit_mode == IO_MODE_OFFLOAD) {
- log_err("fio: io_submit_mode=offload is not compatible (or "
- "useful) with io_uring\n");
- return 1;
- }
-
/* sqthread submission requires registered files */
if (o->sqpoll_thread)
o->registerfiles = 1;
@@ -806,7 +804,7 @@ static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
static struct ioengine_ops ioengine = {
.name = "io_uring",
.version = FIO_IOOPS_VERSION,
- .flags = FIO_ASYNCIO_SYNC_TRIM,
+ .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD,
.init = fio_ioring_init,
.post_init = fio_ioring_post_init,
.io_u_init = fio_ioring_io_u_init,
diff --git a/engines/libcufile.c b/engines/libcufile.c
new file mode 100644
index 00000000..e575b786
--- /dev/null
+++ b/engines/libcufile.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
+ * mailto:info@systemfabricworks.com
+ *
+ * License: GPLv2, see COPYING.
+ *
+ * libcufile engine
+ *
+ * fio I/O engine using the NVIDIA cuFile API.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <cufile.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <pthread.h>
+
+#include "../fio.h"
+#include "../lib/pow2.h"
+#include "../optgroup.h"
+#include "../lib/memalign.h"
+
+#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)
+
+#define LOGGED_BUFLEN_NOT_ALIGNED 0x01
+#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
+#define GPU_ID_SEP ":"
+
+enum {
+ IO_CUFILE = 1,
+ IO_POSIX = 2
+};
+
+struct libcufile_options {
+ struct thread_data *td;
+ char *gpu_ids; /* colon-separated list of GPU ids,
+ one per job */
+ void *cu_mem_ptr; /* GPU memory */
+ void *junk_buf; /* buffer to simulate cudaMemcpy with
+ posix I/O write */
+ int my_gpu_id; /* GPU id to use for this job */
+ unsigned int cuda_io; /* Type of I/O to use with CUDA */
+ size_t total_mem; /* size for cu_mem_ptr and junk_buf */
+ int logged; /* bitmask of log messages that have
+ been output, prevent flood */
+};
+
+struct fio_libcufile_data {
+ CUfileDescr_t cf_descr;
+ CUfileHandle_t cf_handle;
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "gpu_dev_ids",
+ .lname = "libcufile engine gpu dev ids",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct libcufile_options, gpu_ids),
+ .help = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = "cuda_io",
+ .lname = "libcufile cuda io",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct libcufile_options, cuda_io),
+ .help = "Type of I/O to use with CUDA",
+ .def = "cufile",
+ .posval = {
+ { .ival = "cufile",
+ .oval = IO_CUFILE,
+ .help = "libcufile nvidia-fs"
+ },
+ { .ival = "posix",
+ .oval = IO_POSIX,
+ .help = "POSIX I/O"
+ }
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+static int running = 0;
+static int cufile_initialized = 0;
+static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
+
+#define check_cudaruntimecall(fn, rc) \
+ do { \
+ cudaError_t res = fn; \
+ if (res != cudaSuccess) { \
+ const char *str = cudaGetErrorName(res); \
+ log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
+ #fn, __LINE__, res, str); \
+ rc = -1; \
+ } else \
+ rc = 0; \
+ } while(0)
+
+static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
+{
+ if (IS_CUFILE_ERR(st.err))
+ return cufileop_status_error(st.err);
+ return "unknown";
+}
+
+/*
+ * Assign GPU to subjob roundrobin, similar to how multiple
+ * entries in 'directory' are handled by fio.
+ */
+static int fio_libcufile_find_gpu_id(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+ int gpu_id = 0;
+
+ if (o->gpu_ids != NULL) {
+ char *gpu_ids, *pos, *cur;
+ int i, id_count, gpu_idx;
+
+ for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
+ cur = strchr(cur, GPU_ID_SEP[0]);
+ if (cur != NULL)
+ cur++;
+ }
+
+ gpu_idx = td->subjob_number % id_count;
+
+ pos = gpu_ids = strdup(o->gpu_ids);
+ if (gpu_ids == NULL) {
+ log_err("strdup(gpu_ids): err=%d\n", errno);
+ return -1;
+ }
+
+ i = 0;
+ while (pos != NULL && i <= gpu_idx) {
+ i++;
+ cur = strsep(&pos, GPU_ID_SEP);
+ }
+
+ if (cur)
+ gpu_id = atoi(cur);
+
+ free(gpu_ids);
+ }
+
+ return gpu_id;
+}
+
+static int fio_libcufile_init(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+ CUfileError_t status;
+ int initialized;
+ int rc;
+
+ pthread_mutex_lock(&running_lock);
+ if (running == 0) {
+ assert(cufile_initialized == 0);
+ if (o->cuda_io == IO_CUFILE) {
+ /* only open the driver if this is the first worker thread */
+ status = cuFileDriverOpen();
+ if (status.err != CU_FILE_SUCCESS)
+ log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
+ fio_libcufile_get_cuda_error(status));
+ else
+ cufile_initialized = 1;
+ }
+ }
+ running++;
+ initialized = cufile_initialized;
+ pthread_mutex_unlock(&running_lock);
+
+ if (o->cuda_io == IO_CUFILE && !initialized)
+ return 1;
+
+ o->my_gpu_id = fio_libcufile_find_gpu_id(td);
+ if (o->my_gpu_id < 0)
+ return 1;
+
+ dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
+ check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
+ if (rc != 0)
+ return 1;
+
+ return 0;
+}
+
+static inline int fio_libcufile_pre_write(struct thread_data *td,
+ struct libcufile_options *o,
+ struct io_u *io_u,
+ size_t gpu_offset)
+{
+ int rc = 0;
+
+ if (o->cuda_io == IO_CUFILE) {
+ if (td->o.verify) {
+ /*
+ Data is being verified, copy the io_u buffer to GPU memory.
+ This isn't done in the non-verify case because the data would
+ already be in GPU memory in a normal cuFile application.
+ */
+ check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buf,
+ io_u->xfer_buflen,
+ cudaMemcpyHostToDevice), rc);
+ if (rc != 0) {
+ log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
+ io_u->error = EIO;
+ }
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+
+ /*
+ POSIX I/O is being used, the data has to be copied out of the
+ GPU into a CPU buffer. GPU memory doesn't contain the actual
+ data to write, copy the data to the junk buffer. The purpose
+ of this is to add the overhead of cudaMemcpy() that would be
+ present in a POSIX I/O CUDA application.
+ */
+ check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
+ ((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buflen,
+ cudaMemcpyDeviceToHost), rc);
+ if (rc != 0) {
+ log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
+ io_u->error = EIO;
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ assert(0);
+ rc = EINVAL;
+ }
+
+ return rc;
+}
+
+static inline int fio_libcufile_post_read(struct thread_data *td,
+ struct libcufile_options *o,
+ struct io_u *io_u,
+ size_t gpu_offset)
+{
+ int rc = 0;
+
+ if (o->cuda_io == IO_CUFILE) {
+ if (td->o.verify) {
+ /* Copy GPU memory to CPU buffer for verify */
+ check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
+ ((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buflen,
+ cudaMemcpyDeviceToHost), rc);
+ if (rc != 0) {
+ log_err("DDIR_READ cudaMemcpy D2H failed\n");
+ io_u->error = EIO;
+ }
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+ /* POSIX I/O read, copy the CPU buffer to GPU memory */
+ check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buf,
+ io_u->xfer_buflen,
+ cudaMemcpyHostToDevice), rc);
+ if (rc != 0) {
+ log_err("DDIR_READ cudaMemcpy H2D failed\n");
+ io_u->error = EIO;
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ assert(0);
+ rc = EINVAL;
+ }
+
+ return rc;
+}
+
+static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct libcufile_options *o = td->eo;
+ struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
+ unsigned long long io_offset;
+ ssize_t sz;
+ ssize_t remaining;
+ size_t xfered;
+ size_t gpu_offset;
+ int rc;
+
+ if (o->cuda_io == IO_CUFILE && fcd == NULL) {
+ io_u->error = EINVAL;
+ td_verror(td, EINVAL, "xfer");
+ return FIO_Q_COMPLETED;
+ }
+
+ fio_ro_check(td, io_u);
+
+ switch(io_u->ddir) {
+ case DDIR_SYNC:
+ rc = fsync(io_u->file->fd);
+ if (rc != 0) {
+ io_u->error = errno;
+ log_err("fsync: err=%d\n", errno);
+ }
+ break;
+
+ case DDIR_DATASYNC:
+ rc = fdatasync(io_u->file->fd);
+ if (rc != 0) {
+ io_u->error = errno;
+ log_err("fdatasync: err=%d\n", errno);
+ }
+ break;
+
+ case DDIR_READ:
+ case DDIR_WRITE:
+ /*
+ There may be a better way to calculate gpu_offset. The intent is
   that gpu_offset equals the difference between io_u->xfer_buf and
+ the page-aligned base address for io_u buffers.
+ */
+ gpu_offset = io_u->index * io_u->xfer_buflen;
+ io_offset = io_u->offset;
+ remaining = io_u->xfer_buflen;
+
+ xfered = 0;
+ sz = 0;
+
+ assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
+
+ if (o->cuda_io == IO_CUFILE) {
+ if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
+ (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
+ log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
+ o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
+ }
+
+ if (!(ALIGNED_4KB(gpu_offset) ||
+ (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
+ log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
+ o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
+ }
+ }
+
+ if (io_u->ddir == DDIR_WRITE)
+ rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
+
+ if (io_u->error != 0)
+ break;
+
+ while (remaining > 0) {
+ assert(gpu_offset + xfered <= o->total_mem);
+ if (io_u->ddir == DDIR_READ) {
+ if (o->cuda_io == IO_CUFILE) {
+ sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
+ io_offset + xfered, gpu_offset + xfered);
+ if (sz == -1) {
+ io_u->error = errno;
+ log_err("cuFileRead: err=%d\n", errno);
+ } else if (sz < 0) {
+ io_u->error = EIO;
+ log_err("cuFileRead: err=%ld:%s\n", sz,
+ cufileop_status_error(-sz));
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+ sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
+ remaining, io_offset + xfered);
+ if (sz < 0) {
+ io_u->error = errno;
+ log_err("pread: err=%d\n", errno);
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ io_u->error = -1;
+ assert(0);
+ }
+ } else if (io_u->ddir == DDIR_WRITE) {
+ if (o->cuda_io == IO_CUFILE) {
+ sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
+ io_offset + xfered, gpu_offset + xfered);
+ if (sz == -1) {
+ io_u->error = errno;
+ log_err("cuFileWrite: err=%d\n", errno);
+ } else if (sz < 0) {
+ io_u->error = EIO;
+ log_err("cuFileWrite: err=%ld:%s\n", sz,
+ cufileop_status_error(-sz));
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+ sz = pwrite(io_u->file->fd,
+ ((char*) io_u->xfer_buf) + xfered,
+ remaining, io_offset + xfered);
+ if (sz < 0) {
+ io_u->error = errno;
+ log_err("pwrite: err=%d\n", errno);
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ io_u->error = -1;
+ assert(0);
+ }
+ } else {
+ log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
+ io_u->error = -1;
+ assert(0);
+ break;
+ }
+
+ if (io_u->error != 0)
+ break;
+
+ remaining -= sz;
+ xfered += sz;
+
+ if (remaining != 0)
+ log_info("Incomplete %s: %ld bytes remaining\n",
+ io_u->ddir == DDIR_READ? "read" : "write", remaining);
+ }
+
+ if (io_u->error != 0)
+ break;
+
+ if (io_u->ddir == DDIR_READ)
+ rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
+ break;
+
+ default:
+ io_u->error = EINVAL;
+ break;
+ }
+
+ if (io_u->error != 0) {
+ log_err("IO failed\n");
+ td_verror(td, io_u->error, "xfer");
+ }
+
+ return FIO_Q_COMPLETED;
+}
+
+static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
+{
+ struct libcufile_options *o = td->eo;
+ struct fio_libcufile_data *fcd = NULL;
+ int rc;
+ CUfileError_t status;
+
+ rc = generic_open_file(td, f);
+ if (rc)
+ return rc;
+
+ if (o->cuda_io == IO_CUFILE) {
+ fcd = calloc(1, sizeof(*fcd));
+ if (fcd == NULL) {
+ rc = ENOMEM;
+ goto exit_err;
+ }
+
+ fcd->cf_descr.handle.fd = f->fd;
+ fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+ status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
+ if (status.err != CU_FILE_SUCCESS) {
+ log_err("cufile register: err=%d:%s\n", status.err,
+ fio_libcufile_get_cuda_error(status));
+ rc = EINVAL;
+ goto exit_err;
+ }
+ }
+
+ FILE_SET_ENG_DATA(f, fcd);
+ return 0;
+
+exit_err:
+ if (fcd) {
+ free(fcd);
+ fcd = NULL;
+ }
+ if (f) {
+ int rc2 = generic_close_file(td, f);
+ if (rc2)
+ log_err("generic_close_file: err=%d\n", rc2);
+ }
+ return rc;
+}
+
+static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
+{
+ struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
+ int rc;
+
+ if (fcd != NULL) {
+ cuFileHandleDeregister(fcd->cf_handle);
+ FILE_SET_ENG_DATA(f, NULL);
+ free(fcd);
+ }
+
+ rc = generic_close_file(td, f);
+
+ return rc;
+}
+
+static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+ struct libcufile_options *o = td->eo;
+ int rc;
+ CUfileError_t status;
+
+ o->total_mem = total_mem;
+ o->logged = 0;
+ o->cu_mem_ptr = NULL;
+ o->junk_buf = NULL;
+ td->orig_buffer = calloc(1, total_mem);
+ if (!td->orig_buffer) {
+ log_err("orig_buffer calloc failed: err=%d\n", errno);
+ goto exit_error;
+ }
+
+ if (o->cuda_io == IO_POSIX) {
+ o->junk_buf = calloc(1, total_mem);
+ if (o->junk_buf == NULL) {
+ log_err("junk_buf calloc failed: err=%d\n", errno);
+ goto exit_error;
+ }
+ }
+
+ dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
+ check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
+ if (rc != 0)
+ goto exit_error;
+ check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
+ if (rc != 0)
+ goto exit_error;
+
+ if (o->cuda_io == IO_CUFILE) {
+ status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
+ if (status.err != CU_FILE_SUCCESS) {
+ log_err("cuFileBufRegister: err=%d:%s\n", status.err,
+ fio_libcufile_get_cuda_error(status));
+ goto exit_error;
+ }
+ }
+
+ return 0;
+
+exit_error:
+ if (td->orig_buffer) {
+ free(td->orig_buffer);
+ td->orig_buffer = NULL;
+ }
+ if (o->junk_buf) {
+ free(o->junk_buf);
+ o->junk_buf = NULL;
+ }
+ if (o->cu_mem_ptr) {
+ cudaFree(o->cu_mem_ptr);
+ o->cu_mem_ptr = NULL;
+ }
+ return 1;
+}
+
+static void fio_libcufile_iomem_free(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+
+ if (o->junk_buf) {
+ free(o->junk_buf);
+ o->junk_buf = NULL;
+ }
+ if (o->cu_mem_ptr) {
+ if (o->cuda_io == IO_CUFILE)
+ cuFileBufDeregister(o->cu_mem_ptr);
+ cudaFree(o->cu_mem_ptr);
+ o->cu_mem_ptr = NULL;
+ }
+ if (td->orig_buffer) {
+ free(td->orig_buffer);
+ td->orig_buffer = NULL;
+ }
+}
+
+static void fio_libcufile_cleanup(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+
+ pthread_mutex_lock(&running_lock);
+ running--;
+ assert(running >= 0);
+ if (running == 0) {
+ /* only close the driver if initialized and
+ this is the last worker thread */
+ if (o->cuda_io == IO_CUFILE && cufile_initialized)
+ cuFileDriverClose();
+ cufile_initialized = 0;
+ }
+ pthread_mutex_unlock(&running_lock);
+}
+
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "libcufile",
+ .version = FIO_IOOPS_VERSION,
+ .init = fio_libcufile_init,
+ .queue = fio_libcufile_queue,
+ .open_file = fio_libcufile_open_file,
+ .close_file = fio_libcufile_close_file,
+ .iomem_alloc = fio_libcufile_iomem_alloc,
+ .iomem_free = fio_libcufile_iomem_free,
+ .cleanup = fio_libcufile_cleanup,
+ .flags = FIO_SYNCIO,
+ .options = options,
+ .option_struct_size = sizeof(struct libcufile_options)
+};
+
+void fio_init fio_libcufile_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+void fio_exit fio_libcufile_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/libhdfs.c b/engines/libhdfs.c
index 9ca82f78..eb55c3c5 100644
--- a/engines/libhdfs.c
+++ b/engines/libhdfs.c
@@ -240,7 +240,7 @@ int fio_hdfsio_close_file(struct thread_data *td, struct fio_file *f)
return 0;
}
-static int fio_hdfsio_init(struct thread_data *td)
+static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u)
{
struct hdfsio_options *options = td->eo;
struct hdfsio_data *hd = td->io_ops_data;
@@ -349,7 +349,7 @@ static int fio_hdfsio_setup(struct thread_data *td)
return 0;
}
-static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u)
+static int fio_hdfsio_init(struct thread_data *td)
{
struct hdfsio_data *hd = td->io_ops_data;
struct hdfsio_options *options = td->eo;
diff --git a/engines/libpmem.c b/engines/libpmem.c
index a9b3e29b..ab29a453 100644
--- a/engines/libpmem.c
+++ b/engines/libpmem.c
@@ -2,7 +2,7 @@
* libpmem: IO engine that uses PMDK libpmem to read and write data
*
* Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
- * Copyright 2018-2020, Intel Corporation
+ * Copyright 2018-2021, Intel Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License,
@@ -18,7 +18,8 @@
/*
* libpmem engine
*
- * IO engine that uses libpmem to write data (and memcpy to read)
+ * IO engine that uses libpmem (part of PMDK collection) to write data
+ * and libc's memcpy to read. It requires PMDK >= 1.5.
*
* To use:
* ioengine=libpmem
@@ -43,25 +44,13 @@
* mkdir /mnt/pmem0
* mount -o dax /dev/pmem0 /mnt/pmem0
*
- * See examples/libpmem.fio for more.
- *
- *
- * libpmem.so
- * By default, the libpmem engine will let the system find the libpmem.so
- * that it uses. You can use an alternative libpmem by setting the
- * FIO_PMEM_LIB environment variable to the full path to the desired
- * libpmem.so. This engine requires PMDK >= 1.5.
+ * See examples/libpmem.fio for complete usage example.
*/
#include <stdio.h>
-#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <libgen.h>
#include <libpmem.h>
#include "../fio.h"
@@ -77,8 +66,8 @@ static int fio_libpmem_init(struct thread_data *td)
{
struct thread_options *o = &td->o;
- dprint(FD_IO,"o->rw_min_bs %llu \n o->fsync_blocks %u \n o->fdatasync_blocks %u \n",
- o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks);
+ dprint(FD_IO, "o->rw_min_bs %llu\n o->fsync_blocks %u\n o->fdatasync_blocks %u\n",
+ o->rw_min_bs, o->fsync_blocks, o->fdatasync_blocks);
dprint(FD_IO, "DEBUG fio_libpmem_init\n");
if ((o->rw_min_bs & page_mask) &&
@@ -91,23 +80,17 @@ static int fio_libpmem_init(struct thread_data *td)
}
/*
- * This is the pmem_map_file execution function
+ * This is the pmem_map_file execution function, a helper to
+ * fio_libpmem_open_file function.
*/
static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
size_t length, off_t off)
{
struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
- mode_t mode = 0;
+ mode_t mode = S_IWUSR | S_IRUSR;
size_t mapped_len;
int is_pmem;
- if(td_rw(td))
- mode = S_IWUSR | S_IRUSR;
- else if (td_write(td))
- mode = S_IWUSR;
- else
- mode = S_IRUSR;
-
dprint(FD_IO, "DEBUG fio_libpmem_file\n");
dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name,
td->o.verify);
@@ -142,11 +125,11 @@ static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
{
struct fio_libpmem_data *fdd;
- dprint(FD_IO,"DEBUG fio_libpmem_open_file\n");
- dprint(FD_IO,"f->io_size=%ld \n",f->io_size);
- dprint(FD_IO,"td->o.size=%lld \n",td->o.size);
- dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth);
- dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch);
+ dprint(FD_IO, "DEBUG fio_libpmem_open_file\n");
+ dprint(FD_IO, "f->io_size=%ld\n", f->io_size);
+ dprint(FD_IO, "td->o.size=%lld\n", td->o.size);
+ dprint(FD_IO, "td->o.iodepth=%d\n", td->o.iodepth);
+ dprint(FD_IO, "td->o.iodepth_batch=%d\n", td->o.iodepth_batch);
if (fio_file_open(f))
td_io_close_file(td, f);
@@ -167,8 +150,8 @@ static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
struct fio_file *f = io_u->file;
struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
- dprint(FD_IO, "DEBUG fio_libpmem_prep\n" );
- dprint(FD_IO," io_u->offset %llu : fdd->libpmem_off %ld : "
+ dprint(FD_IO, "DEBUG fio_libpmem_prep\n");
+ dprint(FD_IO, "io_u->offset %llu : fdd->libpmem_off %ld : "
"io_u->buflen %llu : fdd->libpmem_sz %ld\n",
io_u->offset, fdd->libpmem_off,
io_u->buflen, fdd->libpmem_sz);
@@ -192,7 +175,9 @@ static enum fio_q_status fio_libpmem_queue(struct thread_data *td,
io_u->error = 0;
dprint(FD_IO, "DEBUG fio_libpmem_queue\n");
- dprint(FD_IO,"td->o.odirect %d td->o.sync_io %d \n",td->o.odirect, td->o.sync_io);
+ dprint(FD_IO, "td->o.odirect %d td->o.sync_io %d\n",
+ td->o.odirect, td->o.sync_io);
+ /* map both O_SYNC / DSYNC to not use NODRAIN */
flags = td->o.sync_io ? 0 : PMEM_F_MEM_NODRAIN;
flags |= td->o.odirect ? PMEM_F_MEM_NONTEMPORAL : PMEM_F_MEM_TEMPORAL;
@@ -202,7 +187,7 @@ static enum fio_q_status fio_libpmem_queue(struct thread_data *td,
break;
case DDIR_WRITE:
dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
- io_u->mmap_data, io_u->xfer_buf );
+ io_u->mmap_data, io_u->xfer_buf);
pmem_memcpy(io_u->mmap_data,
io_u->xfer_buf,
io_u->xfer_buflen,
@@ -226,13 +211,7 @@ static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
int ret = 0;
- dprint(FD_IO,"DEBUG fio_libpmem_close_file\n");
- dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
-
- if (!td->o.odirect) {
- dprint(FD_IO,"pmem_drain\n");
- pmem_drain();
- }
+ dprint(FD_IO, "DEBUG fio_libpmem_close_file\n");
if (fdd->libpmem_ptr)
ret = pmem_unmap(fdd->libpmem_ptr, fdd->libpmem_sz);
@@ -254,6 +233,7 @@ FIO_STATIC struct ioengine_ops ioengine = {
.open_file = fio_libpmem_open_file,
.close_file = fio_libpmem_close_file,
.get_file_size = generic_get_file_size,
+ .prepopulate_file = generic_prepopulate_file,
.flags = FIO_SYNCIO | FIO_RAWIO | FIO_DISKLESSIO | FIO_NOEXTEND |
FIO_NODISKUTIL | FIO_BARRIER | FIO_MEMALIGN,
};
diff --git a/engines/librpma_apm.c b/engines/librpma_apm.c
new file mode 100644
index 00000000..ffa3769d
--- /dev/null
+++ b/engines/librpma_apm.c
@@ -0,0 +1,256 @@
+/*
+ * librpma_apm: IO engine that uses PMDK librpma to read and write data,
+ * based on Appliance Persistency Method
+ *
+ * Copyright 2020-2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+/* client side implementation */
+
+static inline int client_io_flush(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len);
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+ unsigned int *io_u_index);
+
+static int client_init(struct thread_data *td)
+{
+ struct librpma_fio_client_data *ccd;
+ unsigned int sq_size;
+ uint32_t cq_size;
+ struct rpma_conn_cfg *cfg = NULL;
+ struct rpma_peer_cfg *pcfg = NULL;
+ int ret;
+
+ /* not supported readwrite = trim / randtrim / trimwrite */
+ if (td_trim(td)) {
+ td_verror(td, EINVAL, "Not supported mode.");
+ return -1;
+ }
+
+ /*
+ * Calculate the required queue sizes where:
+ * - the send queue (SQ) has to be big enough to accommodate
+ * all io_us (WRITEs) and all flush requests (FLUSHes)
+ * - the completion queue (CQ) has to be big enough to accommodate all
+ * success and error completions (cq_size = sq_size)
+ */
+ if (td_random(td) || td_rw(td)) {
+ /*
+ * sq_size = max(rand_read_sq_size, rand_write_sq_size)
+ * where rand_read_sq_size < rand_write_sq_size because read
+ * does not require flush afterwards
+ * rand_write_sq_size = N * (WRITE + FLUSH)
+ *
+ * Note: rw is no different from random write since having
+ * interleaved reads with writes in extreme forces you to flush
+ * as often as when the writes are random.
+ */
+ sq_size = 2 * td->o.iodepth;
+ } else if (td_write(td)) {
+ /* sequential TD_DDIR_WRITE only */
+ if (td->o.sync_io) {
+ sq_size = 2; /* WRITE + FLUSH */
+ } else {
+ /*
+ * N * WRITE + B * FLUSH where:
+ * - B == ceil(iodepth / iodepth_batch)
+ * which is the number of batches for N writes
+ */
+ sq_size = td->o.iodepth + LIBRPMA_FIO_CEIL(td->o.iodepth,
+ td->o.iodepth_batch);
+ }
+ } else {
+ /* TD_DDIR_READ only */
+ if (td->o.sync_io) {
+ sq_size = 1; /* READ */
+ } else {
+ sq_size = td->o.iodepth; /* N x READ */
+ }
+ }
+ cq_size = sq_size;
+
+ /* create a connection configuration object */
+ if ((ret = rpma_conn_cfg_new(&cfg))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+ return -1;
+ }
+
+ /* apply queue sizes */
+ if ((ret = rpma_conn_cfg_set_sq_size(cfg, sq_size))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+ goto err_cfg_delete;
+ }
+ if ((ret = rpma_conn_cfg_set_cq_size(cfg, cq_size))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+ goto err_cfg_delete;
+ }
+
+ if (librpma_fio_client_init(td, cfg))
+ goto err_cfg_delete;
+
+ ccd = td->io_ops_data;
+
+ if (ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT) {
+ if (!ccd->ws->direct_write_to_pmem) {
+ if (td->thread_number == 1)
+ log_err(
+ "Fio librpma engine will not work until the Direct Write to PMem on the server side is possible (direct_write_to_pmem)\n");
+ goto err_cleanup_common;
+ }
+
+ /* configure peer's direct write to pmem support */
+ if ((ret = rpma_peer_cfg_new(&pcfg))) {
+ librpma_td_verror(td, ret, "rpma_peer_cfg_new");
+ goto err_cleanup_common;
+ }
+
+ if ((ret = rpma_peer_cfg_set_direct_write_to_pmem(pcfg, true))) {
+ librpma_td_verror(td, ret,
+ "rpma_peer_cfg_set_direct_write_to_pmem");
+ (void) rpma_peer_cfg_delete(&pcfg);
+ goto err_cleanup_common;
+ }
+
+ if ((ret = rpma_conn_apply_remote_peer_cfg(ccd->conn, pcfg))) {
+ librpma_td_verror(td, ret,
+ "rpma_conn_apply_remote_peer_cfg");
+ (void) rpma_peer_cfg_delete(&pcfg);
+ goto err_cleanup_common;
+ }
+
+ (void) rpma_peer_cfg_delete(&pcfg);
+ } else if (td->thread_number == 1) {
+ /* XXX log_info mixes with the JSON output */
+ log_err(
+ "Note: Direct Write to PMem is not supported by default nor required if you use DRAM instead of PMem on the server side (direct_write_to_pmem).\n"
+ "Remember that flushing to DRAM does not make your data persistent and may be used only for experimental purposes.\n");
+ }
+
+ if ((ret = rpma_conn_cfg_delete(&cfg))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_delete");
+ /* non fatal error - continue */
+ }
+
+ ccd->flush = client_io_flush;
+ ccd->get_io_u_index = client_get_io_u_index;
+
+ return 0;
+
+err_cleanup_common:
+ librpma_fio_client_cleanup(td);
+
+err_cfg_delete:
+ (void) rpma_conn_cfg_delete(&cfg);
+
+ return -1;
+}
+
+static void client_cleanup(struct thread_data *td)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+ if (ccd == NULL)
+ return;
+
+ free(ccd->client_data);
+
+ librpma_fio_client_cleanup(td);
+}
+
+static inline int client_io_flush(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+ size_t dst_offset = first_io_u->offset;
+ int ret;
+
+ if ((ret = rpma_flush(ccd->conn, ccd->server_mr, dst_offset, len,
+ ccd->server_mr_flush_type, RPMA_F_COMPLETION_ALWAYS,
+ (void *)(uintptr_t)last_io_u->index))) {
+ librpma_td_verror(td, ret, "rpma_flush");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+ unsigned int *io_u_index)
+{
+ memcpy(io_u_index, &cmpl->op_context, sizeof(*io_u_index));
+
+ return 1;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_client = {
+ .name = "librpma_apm_client",
+ .version = FIO_IOOPS_VERSION,
+ .init = client_init,
+ .post_init = librpma_fio_client_post_init,
+ .get_file_size = librpma_fio_client_get_file_size,
+ .open_file = librpma_fio_file_nop,
+ .queue = librpma_fio_client_queue,
+ .commit = librpma_fio_client_commit,
+ .getevents = librpma_fio_client_getevents,
+ .event = librpma_fio_client_event,
+ .errdetails = librpma_fio_client_errdetails,
+ .close_file = librpma_fio_file_nop,
+ .cleanup = client_cleanup,
+ .flags = FIO_DISKLESSIO,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* server side implementation */
+
+static int server_open_file(struct thread_data *td, struct fio_file *f)
+{
+ return librpma_fio_server_open_file(td, f, NULL);
+}
+
+static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u)
+{
+ return FIO_Q_COMPLETED;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_server = {
+ .name = "librpma_apm_server",
+ .version = FIO_IOOPS_VERSION,
+ .init = librpma_fio_server_init,
+ .open_file = server_open_file,
+ .close_file = librpma_fio_server_close_file,
+ .queue = server_queue,
+ .invalidate = librpma_fio_file_nop,
+ .cleanup = librpma_fio_server_cleanup,
+ .flags = FIO_SYNCIO,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* register both engines */
+
+static void fio_init fio_librpma_apm_register(void)
+{
+ register_ioengine(&ioengine_client);
+ register_ioengine(&ioengine_server);
+}
+
+static void fio_exit fio_librpma_apm_unregister(void)
+{
+ unregister_ioengine(&ioengine_client);
+ unregister_ioengine(&ioengine_server);
+}
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c
new file mode 100644
index 00000000..3d605ed6
--- /dev/null
+++ b/engines/librpma_fio.c
@@ -0,0 +1,1062 @@
+/*
+ * librpma_fio: librpma_apm and librpma_gpspm engines' common part.
+ *
+ * Copyright 2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+#include <libpmem.h>
+
+/* engine options shared by the librpma client and server engines */
+struct fio_option librpma_fio_options[] = {
+	{
+		.name = "serverip",
+		.lname = "rpma_server_ip",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct librpma_fio_options_values, server_ip),
+		.help = "IP address the server is listening on",
+		.def = "",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_LIBRPMA,
+	},
+	{
+		.name = "port",
+		/* underscore added for consistency with "rpma_server_ip" */
+		.lname = "rpma_server_port",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct librpma_fio_options_values, port),
+		.help = "port the server is listening on",
+		.def = "7204",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_LIBRPMA,
+	},
+	{
+		.name = "direct_write_to_pmem",
+		.lname = "Direct Write to PMem (via RDMA) from the remote host is possible",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct librpma_fio_options_values,
+				direct_write_to_pmem),
+		.help = "Set to true ONLY when Direct Write to PMem from the remote host is possible (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)",
+		.def = "",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_LIBRPMA,
+	},
+	{
+		.name = "busy_wait_polling",
+		.lname = "Set to 0 to wait for completion instead of busy-wait polling completion.",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct librpma_fio_options_values,
+				busy_wait_polling),
+		.help = "Set to false if you want to reduce CPU usage",
+		.def = "1",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_LIBRPMA,
+	},
+	{
+		.name = NULL,
+	},
+};
+
+/*
+ * Derive a per-thread port number from the base port string:
+ * port_out = port_base_str + (td->thread_number - 1).
+ * Returns 0 on success, -1 on error (port_out is left as an empty string).
+ */
+int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
+		char *port_out)
+{
+	unsigned long int port_ul;
+	unsigned int port_new;
+
+	port_out[0] = '\0';
+
+	/*
+	 * strtoul() sets errno only on failure, so it has to be cleared
+	 * first; ULONG_MAX alone is also a legal conversion result, so the
+	 * error is detected by ERANGE, not by the value.
+	 */
+	errno = 0;
+	port_ul = strtoul(port_base_str, NULL, 10);
+	if (port_ul == ULONG_MAX && errno == ERANGE) {
+		td_verror(td, errno, "strtoul");
+		return -1;
+	}
+
+	/* shift the port by the (1-based) thread number */
+	port_ul += td->thread_number - 1;
+	if (port_ul >= UINT_MAX) {
+		log_err("[%u] port number (%lu) bigger than UINT_MAX\n",
+			td->thread_number, port_ul);
+		return -1;
+	}
+
+	port_new = port_ul;
+	/* snprintf()'s size argument already accounts for the trailing NUL */
+	snprintf(port_out, LIBRPMA_FIO_PORT_STR_LEN_MAX, "%u", port_new);
+
+	return 0;
+}
+
+/* allocate a page-aligned DRAM buffer of the given size (not memory-mapped) */
+char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
+	struct librpma_fio_mem *mem)
+{
+	void *buf = NULL;
+	int err;
+
+	err = posix_memalign(&buf, page_size, size);
+	if (err) {
+		log_err("fio: posix_memalign() failed\n");
+		td_verror(td, err, "posix_memalign");
+		return NULL;
+	}
+
+	/* size_mmap == 0 tells librpma_fio_free() to use free() */
+	mem->mem_ptr = buf;
+	mem->size_mmap = 0;
+
+	return buf;
+}
+
+/*
+ * Map the PMem file and return a pointer to this thread's slice of it
+ * (offset (thread_number - 1) * size). Returns NULL on any error.
+ */
+char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+	size_t size, struct librpma_fio_mem *mem)
+{
+	size_t mapped_len = 0;
+	char *mapped = NULL;
+	int is_pmem = 0;
+	size_t slice_off;
+
+	/* RDMA registration requires page-aligned sizes */
+	if (size % page_size) {
+		log_err("fio: size (%zu) is not aligned to page size (%zu)\n",
+			size, page_size);
+		return NULL;
+	}
+
+	/* each thread works on its own slice of the mapped file */
+	slice_off = (td->thread_number - 1) * size;
+
+	if (!filename) {
+		log_err("fio: filename is not set\n");
+		return NULL;
+	}
+
+	/* map the whole file (len == 0) */
+	mapped = pmem_map_file(filename, 0 /* len */, 0 /* flags */,
+			0 /* mode */, &mapped_len, &is_pmem);
+	if (!mapped) {
+		log_err("fio: pmem_map_file(%s) failed\n", filename);
+		/* pmem_map_file() sets errno on failure */
+		td_verror(td, errno, "pmem_map_file");
+		return NULL;
+	}
+
+	/* the mapping has to be backed by persistent memory */
+	if (!is_pmem) {
+		log_err("fio: %s is not located in persistent memory\n",
+			filename);
+		goto err_unmap;
+	}
+
+	/* the file has to be big enough for all threads' slices */
+	if (mapped_len < slice_off + size) {
+		log_err(
+			"fio: %s is too small to handle so many threads (%zu < %zu)\n",
+			filename, mapped_len, slice_off + size);
+		goto err_unmap;
+	}
+
+	log_info("fio: size of memory mapped from the file %s: %zu\n",
+		filename, mapped_len);
+
+	mem->mem_ptr = mapped;
+	mem->size_mmap = mapped_len;
+
+	return mapped + slice_off;
+
+err_unmap:
+	(void) pmem_unmap(mapped, mapped_len);
+	return NULL;
+}
+
+/* release memory obtained from librpma_fio_allocate_{dram,pmem}() */
+void librpma_fio_free(struct librpma_fio_mem *mem)
+{
+	if (mem->size_mmap == 0) {
+		/* DRAM allocation (posix_memalign) */
+		free(mem->mem_ptr);
+		return;
+	}
+
+	/* PMem mapping (pmem_map_file) */
+	(void) pmem_unmap(mem->mem_ptr, mem->size_mmap);
+}
+
+#define LIBRPMA_FIO_RETRY_MAX_NO 10
+#define LIBRPMA_FIO_RETRY_DELAY_S 5
+
+/*
+ * Common client-side initialization: obtain the IBV context for the
+ * server's IP, allocate the client's data and in-memory io_u queues,
+ * connect to the server (retrying rejected connections) and cache the
+ * server's workspace and memory-region representation.
+ * Returns 0 on success, -1 on error.
+ */
+int librpma_fio_client_init(struct thread_data *td,
+		struct rpma_conn_cfg *cfg)
+{
+	struct librpma_fio_client_data *ccd;
+	struct librpma_fio_options_values *o = td->eo;
+	struct ibv_context *dev = NULL;
+	char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
+	struct rpma_conn_req *req = NULL;
+	enum rpma_conn_event event;
+	struct rpma_conn_private_data pdata;
+	enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
+	int remote_flush_type;
+	int retry;
+	int ret;
+
+	/* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
+#ifdef FIO_INC_DEBUG
+	if ((1UL << FD_NET) & fio_debug)
+		log_level_aux = RPMA_LOG_LEVEL_INFO;
+#endif
+
+	/* configure logging thresholds to see more details */
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);
+
+	/* obtain an IBV context for a remote IP address */
+	if ((ret = rpma_utils_get_ibv_context(o->server_ip,
+			RPMA_UTIL_IBV_CONTEXT_REMOTE, &dev))) {
+		librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
+		return -1;
+	}
+
+	/* allocate client's data */
+	ccd = calloc(1, sizeof(*ccd));
+	if (ccd == NULL) {
+		td_verror(td, errno, "calloc");
+		return -1;
+	}
+
+	/* allocate all in-memory queues */
+	ccd->io_us_queued = calloc(td->o.iodepth, sizeof(*ccd->io_us_queued));
+	if (ccd->io_us_queued == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_ccd;
+	}
+
+	ccd->io_us_flight = calloc(td->o.iodepth, sizeof(*ccd->io_us_flight));
+	if (ccd->io_us_flight == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_io_u_queues;
+	}
+
+	ccd->io_us_completed = calloc(td->o.iodepth,
+			sizeof(*ccd->io_us_completed));
+	if (ccd->io_us_completed == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_io_u_queues;
+	}
+
+	/* create a new peer object */
+	if ((ret = rpma_peer_new(dev, &ccd->peer))) {
+		librpma_td_verror(td, ret, "rpma_peer_new");
+		goto err_free_io_u_queues;
+	}
+
+	/* create a connection request */
+	if (librpma_fio_td_port(o->port, td, port_td))
+		goto err_peer_delete;
+
+	for (retry = 0; retry < LIBRPMA_FIO_RETRY_MAX_NO; retry++) {
+		if ((ret = rpma_conn_req_new(ccd->peer, o->server_ip, port_td,
+				cfg, &req))) {
+			librpma_td_verror(td, ret, "rpma_conn_req_new");
+			goto err_peer_delete;
+		}
+
+		/*
+		 * Connect the connection request
+		 * and obtain the connection object.
+		 */
+		if ((ret = rpma_conn_req_connect(&req, NULL, &ccd->conn))) {
+			librpma_td_verror(td, ret, "rpma_conn_req_connect");
+			goto err_req_delete;
+		}
+
+		/* wait for the connection to establish */
+		if ((ret = rpma_conn_next_event(ccd->conn, &event))) {
+			librpma_td_verror(td, ret, "rpma_conn_next_event");
+			goto err_conn_delete;
+		} else if (event == RPMA_CONN_ESTABLISHED) {
+			break;
+		} else if (event == RPMA_CONN_REJECTED) {
+			(void) rpma_conn_disconnect(ccd->conn);
+			(void) rpma_conn_delete(&ccd->conn);
+			if (retry < LIBRPMA_FIO_RETRY_MAX_NO - 1) {
+				log_err("Thread [%d]: Retrying (#%i) ...\n",
+					td->thread_number, retry + 1);
+				sleep(LIBRPMA_FIO_RETRY_DELAY_S);
+			} else {
+				log_err(
+					"Thread [%d]: The maximum number of retries exceeded. Closing.\n",
+					td->thread_number);
+			}
+		} else {
+			log_err(
+				"rpma_conn_next_event returned an unexptected event: (%s != RPMA_CONN_ESTABLISHED)\n",
+				rpma_utils_conn_event_2str(event));
+			goto err_conn_delete;
+		}
+	}
+
+	/*
+	 * All retries were consumed without establishing a connection
+	 * (rpma_conn_delete() zeroed ccd->conn after the last rejection).
+	 */
+	if (ccd->conn == NULL)
+		goto err_peer_delete;
+
+	/*
+	 * Log the retry count only after the connection was actually
+	 * established (previously this was logged even on final failure).
+	 */
+	if (retry > 0)
+		log_err("Thread [%d]: Connected after retry #%i\n",
+			td->thread_number, retry);
+
+	/* get the connection's private data sent from the server */
+	if ((ret = rpma_conn_get_private_data(ccd->conn, &pdata))) {
+		librpma_td_verror(td, ret, "rpma_conn_get_private_data");
+		goto err_conn_delete;
+	}
+
+	/* get the server's workspace representation */
+	ccd->ws = pdata.ptr;
+
+	/* create the server's memory representation */
+	if ((ret = rpma_mr_remote_from_descriptor(&ccd->ws->descriptor[0],
+			ccd->ws->mr_desc_size, &ccd->server_mr))) {
+		librpma_td_verror(td, ret, "rpma_mr_remote_from_descriptor");
+		goto err_conn_delete;
+	}
+
+	/* get the total size of the shared server memory */
+	if ((ret = rpma_mr_remote_get_size(ccd->server_mr, &ccd->ws_size))) {
+		librpma_td_verror(td, ret, "rpma_mr_remote_get_size");
+		goto err_conn_delete;
+	}
+
+	/* get flush type of the remote node */
+	if ((ret = rpma_mr_remote_get_flush_type(ccd->server_mr,
+			&remote_flush_type))) {
+		librpma_td_verror(td, ret, "rpma_mr_remote_get_flush_type");
+		goto err_conn_delete;
+	}
+
+	ccd->server_mr_flush_type =
+		(remote_flush_type & RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT) ?
+		RPMA_FLUSH_TYPE_PERSISTENT : RPMA_FLUSH_TYPE_VISIBILITY;
+
+	/*
+	 * Assure an io_us buffer allocation is page-size-aligned which is required
+	 * to register for RDMA. User-provided value is intentionally ignored.
+	 */
+	td->o.mem_align = page_size;
+
+	td->io_ops_data = ccd;
+
+	return 0;
+
+err_conn_delete:
+	(void) rpma_conn_disconnect(ccd->conn);
+	(void) rpma_conn_delete(&ccd->conn);
+
+err_req_delete:
+	(void) rpma_conn_req_delete(&req);
+
+err_peer_delete:
+	(void) rpma_peer_delete(&ccd->peer);
+
+err_free_io_u_queues:
+	free(ccd->io_us_queued);
+	free(ccd->io_us_flight);
+	free(ccd->io_us_completed);
+
+err_free_ccd:
+	free(ccd);
+
+	return -1;
+}
+
+/*
+ * Common client-side cleanup: deregister memory, close the connection and
+ * free all client resources. Safe to call when init failed
+ * (td->io_ops_data == NULL).
+ */
+void librpma_fio_client_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	enum rpma_conn_event ev;
+	int ret;
+
+	if (ccd == NULL)
+		return;
+
+	/* delete the iou's memory registration */
+	if ((ret = rpma_mr_dereg(&ccd->orig_mr)))
+		librpma_td_verror(td, ret, "rpma_mr_dereg");
+	/* delete the server's memory representation */
+	if ((ret = rpma_mr_remote_delete(&ccd->server_mr)))
+		librpma_td_verror(td, ret, "rpma_mr_remote_delete");
+	/* initiate disconnection */
+	if ((ret = rpma_conn_disconnect(ccd->conn)))
+		librpma_td_verror(td, ret, "rpma_conn_disconnect");
+	/* wait for disconnection to end up */
+	if ((ret = rpma_conn_next_event(ccd->conn, &ev))) {
+		librpma_td_verror(td, ret, "rpma_conn_next_event");
+	} else if (ev != RPMA_CONN_CLOSED) {
+		log_err(
+			"client_cleanup received an unexpected event (%s != RPMA_CONN_CLOSED)\n",
+			rpma_utils_conn_event_2str(ev));
+	}
+	/* delete the connection */
+	if ((ret = rpma_conn_delete(&ccd->conn)))
+		librpma_td_verror(td, ret, "rpma_conn_delete");
+	/* delete the peer */
+	if ((ret = rpma_peer_delete(&ccd->peer)))
+		librpma_td_verror(td, ret, "rpma_peer_delete");
+	/* free the software queues */
+	free(ccd->io_us_queued);
+	free(ccd->io_us_flight);
+	free(ccd->io_us_completed);
+	free(ccd);
+	td->io_ops_data = NULL; /* zero ccd */
+}
+
+/* no-op open/close/invalidate hook shared by both engine sides */
+int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f)
+{
+	/* nothing to do here */
+	return 0;
+}
+
+/*
+ * Register the io_u buffers for RDMA, after FIO has allocated
+ * td->orig_buffer. Returns 0 on success, a non-zero librpma error code
+ * otherwise.
+ */
+int librpma_fio_client_post_init(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	size_t io_us_size;
+	int ret;
+
+	/*
+	 * td->orig_buffer is not aligned. The engine requires aligned io_us
+	 * so FIO aligns up the address using the formula below.
+	 */
+	ccd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
+			td->o.mem_align;
+
+	/*
+	 * td->orig_buffer_size beside the space really consumed by io_us
+	 * has paddings which can be omitted for the memory registration.
+	 */
+	io_us_size = (unsigned long long)td_max_bs(td) *
+			(unsigned long long)td->o.iodepth;
+
+	/* register the buffer for every RDMA usage the engines may need */
+	if ((ret = rpma_mr_reg(ccd->peer, ccd->orig_buffer_aligned, io_us_size,
+			RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
+			RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
+			RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT, &ccd->orig_mr)))
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+	return ret;
+}
+
+/* the "file" size seen by the client is the server's exposed workspace size */
+int librpma_fio_client_get_file_size(struct thread_data *td,
+		struct fio_file *f)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+	fio_file_set_size_known(f);
+	f->real_file_size = ccd->ws_size;
+
+	return 0;
+}
+
+/*
+ * Execute a single io_u synchronously: post the RDMA operation (writes are
+ * followed by a flush) and busy-wait for its completion.
+ * Always returns FIO_Q_COMPLETED; failures are signalled via io_u->error.
+ */
+static enum fio_q_status client_queue_sync(struct thread_data *td,
+		struct io_u *io_u)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct rpma_completion cmpl;
+	unsigned io_u_index;
+	int ret;
+
+	/* execute io_u */
+	if (io_u->ddir == DDIR_READ) {
+		/* post an RDMA read operation */
+		if (librpma_fio_client_io_read(td, io_u,
+				RPMA_F_COMPLETION_ALWAYS))
+			goto err;
+	} else if (io_u->ddir == DDIR_WRITE) {
+		/* post an RDMA write operation */
+		if (librpma_fio_client_io_write(td, io_u))
+			goto err;
+		if (ccd->flush(td, io_u, io_u, io_u->xfer_buflen))
+			goto err;
+	} else {
+		log_err("unsupported IO mode: %s\n", io_ddir_name(io_u->ddir));
+		goto err;
+	}
+
+	do {
+		/* busy-wait for a completion (no blocking call is used) */
+		ret = rpma_conn_completion_get(ccd->conn, &cmpl);
+		if (ret == RPMA_E_NO_COMPLETION) {
+			/* lack of completion is not an error */
+			continue;
+		} else if (ret != 0) {
+			/* an error occurred */
+			librpma_td_verror(td, ret, "rpma_conn_completion_get");
+			goto err;
+		}
+
+		/* if io_us has completed with an error */
+		if (cmpl.op_status != IBV_WC_SUCCESS)
+			goto err;
+
+		/*
+		 * SEND/RECV completions are only counted; any other op is
+		 * the completion of this io_u itself.
+		 */
+		if (cmpl.op == RPMA_OP_SEND)
+			++ccd->op_send_completed;
+		else {
+			if (cmpl.op == RPMA_OP_RECV)
+				++ccd->op_recv_completed;
+
+			break;
+		}
+	} while (1);
+
+	if (ccd->get_io_u_index(&cmpl, &io_u_index) != 1)
+		goto err;
+
+	if (io_u->index != io_u_index) {
+		log_err(
+			"no matching io_u for received completion found (io_u_index=%u)\n",
+			io_u_index);
+		goto err;
+	}
+
+	/* make sure all SENDs are completed before exit - clean up SQ */
+	if (librpma_fio_client_io_complete_all_sends(td))
+		goto err;
+
+	return FIO_Q_COMPLETED;
+
+err:
+	/* NOTE(review): the original IBV status code is discarded here */
+	io_u->error = -1;
+	return FIO_Q_COMPLETED;
+}
+
+/* queue hook: sync mode executes the io_u at once, async mode batches it */
+enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
+		struct io_u *io_u)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+	if (ccd->io_u_queued_nr == (int)td->o.iodepth)
+		return FIO_Q_BUSY;
+
+	if (td->o.sync_io)
+		return client_queue_sync(td, io_u);
+
+	/* store the io_u; it is executed later by the commit hook */
+	ccd->io_us_queued[ccd->io_u_queued_nr++] = io_u;
+
+	return FIO_Q_QUEUED;
+}
+
+/*
+ * Commit hook: executes all io_us collected in queued[]. Reads request a
+ * completion only for the last read of a burst; sequential writes are
+ * coalesced under a single flush. Returns 0 on success, -1 on error.
+ */
+int librpma_fio_client_commit(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct timespec now;
+	bool fill_time;
+	int i;
+	struct io_u *flush_first_io_u = NULL;
+	unsigned long long int flush_len = 0;
+
+	if (!ccd->io_us_queued)
+		return -1;
+
+	/* execute all io_us from queued[] */
+	for (i = 0; i < ccd->io_u_queued_nr; i++) {
+		struct io_u *io_u = ccd->io_us_queued[i];
+
+		if (io_u->ddir == DDIR_READ) {
+			/*
+			 * Compute the completion flags per io_u (previously
+			 * the flag stuck at COMPLETION_ALWAYS once set):
+			 * only the last read of a read burst has to
+			 * generate a completion.
+			 */
+			int flags = RPMA_F_COMPLETION_ON_ERROR;
+
+			if (i + 1 == ccd->io_u_queued_nr ||
+			    ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE)
+				flags = RPMA_F_COMPLETION_ALWAYS;
+			/* post an RDMA read operation */
+			if (librpma_fio_client_io_read(td, io_u, flags))
+				return -1;
+		} else if (io_u->ddir == DDIR_WRITE) {
+			/* post an RDMA write operation */
+			if (librpma_fio_client_io_write(td, io_u))
+				return -1;
+
+			/* cache the first io_u in the sequence */
+			if (flush_first_io_u == NULL)
+				flush_first_io_u = io_u;
+
+			/*
+			 * the flush length is the sum of all io_u's creating
+			 * the sequence
+			 */
+			flush_len += io_u->xfer_buflen;
+
+			/*
+			 * if io_u's are random the rpma_flush is required
+			 * after each one of them
+			 */
+			if (!td_random(td)) {
+				/*
+				 * When the io_u's are sequential and
+				 * the current io_u is not the last one and
+				 * the next one is also a write operation
+				 * the flush can be postponed by one io_u and
+				 * cover all of them which build a continuous
+				 * sequence.
+				 */
+				if ((i + 1 < ccd->io_u_queued_nr) &&
+				    (ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE))
+					continue;
+			}
+
+			/* flush all writes which build a continuous sequence */
+			if (ccd->flush(td, flush_first_io_u, io_u, flush_len))
+				return -1;
+
+			/*
+			 * reset the flush parameters in preparation for
+			 * the next one
+			 */
+			flush_first_io_u = NULL;
+			flush_len = 0;
+		} else {
+			log_err("unsupported IO mode: %s\n",
+				io_ddir_name(io_u->ddir));
+			return -1;
+		}
+	}
+
+	if ((fill_time = fio_fill_issue_time(td)))
+		fio_gettime(&now, NULL);
+
+	/* move executed io_us from queued[] to flight[] */
+	for (i = 0; i < ccd->io_u_queued_nr; i++) {
+		struct io_u *io_u = ccd->io_us_queued[i];
+
+		/* FIO does not do this if the engine is asynchronous */
+		if (fill_time)
+			memcpy(&io_u->issue_time, &now, sizeof(now));
+
+		/* move executed io_us from queued[] to flight[] */
+		ccd->io_us_flight[ccd->io_u_flight_nr] = io_u;
+		ccd->io_u_flight_nr++;
+
+		/*
+		 * FIO says:
+		 * If an engine has the commit hook
+		 * it has to call io_u_queued() itself.
+		 */
+		io_u_queued(td, io_u);
+	}
+
+	/* FIO does not do this if an engine has the commit hook. */
+	io_u_mark_submit(td, ccd->io_u_queued_nr);
+	ccd->io_u_queued_nr = 0;
+
+	return 0;
+}
+
+/*
+ * Process a single completion event.
+ *
+ * RETURN VALUE
+ * - > 0  - a number of completed io_us
+ * - 0    - when no completions received
+ * - (-1) - when an error occurred
+ */
+static int client_getevent_process(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct rpma_completion cmpl;
+	/* io_u->index of completed io_u (cmpl.op_context) */
+	unsigned int io_u_index;
+	/* # of completed io_us */
+	int cmpl_num = 0;
+	/* helpers */
+	struct io_u *io_u;
+	int i;
+	int ret;
+
+	/* get a completion */
+	if ((ret = rpma_conn_completion_get(ccd->conn, &cmpl))) {
+		/* RPMA_E_NO_COMPLETION means there was nothing to reap */
+		if (ret == RPMA_E_NO_COMPLETION) {
+			/* lack of completion is not an error */
+			return 0;
+		}
+
+		/* an error occurred */
+		librpma_td_verror(td, ret, "rpma_conn_completion_get");
+		return -1;
+	}
+
+	/* if io_us has completed with an error */
+	if (cmpl.op_status != IBV_WC_SUCCESS) {
+		td->error = cmpl.op_status;
+		return -1;
+	}
+
+	/* SEND/RECV completions are only counted (GPSPM messaging traffic) */
+	if (cmpl.op == RPMA_OP_SEND)
+		++ccd->op_send_completed;
+	else if (cmpl.op == RPMA_OP_RECV)
+		++ccd->op_recv_completed;
+
+	if ((ret = ccd->get_io_u_index(&cmpl, &io_u_index)) != 1)
+		return ret;
+
+	/* look for an io_u being completed */
+	for (i = 0; i < ccd->io_u_flight_nr; ++i) {
+		if (ccd->io_us_flight[i]->index == io_u_index) {
+			/* one completion completes all preceding io_us too */
+			cmpl_num = i + 1;
+			break;
+		}
+	}
+
+	/* if no matching io_u has been found */
+	if (cmpl_num == 0) {
+		log_err(
+			"no matching io_u for received completion found (io_u_index=%u)\n",
+			io_u_index);
+		return -1;
+	}
+
+	/* move completed io_us to the completed in-memory queue */
+	for (i = 0; i < cmpl_num; ++i) {
+		/* get and prepare io_u */
+		io_u = ccd->io_us_flight[i];
+
+		/* append to the queue */
+		ccd->io_us_completed[ccd->io_u_completed_nr] = io_u;
+		ccd->io_u_completed_nr++;
+	}
+
+	/* remove completed io_us from the flight queue */
+	for (i = cmpl_num; i < ccd->io_u_flight_nr; ++i)
+		ccd->io_us_flight[i - cmpl_num] = ccd->io_us_flight[i];
+	ccd->io_u_flight_nr -= cmpl_num;
+
+	return cmpl_num;
+}
+
+/*
+ * Collect completed io_us. Loops until at least `min` io_us have completed
+ * AND the SEND completions have caught up with the RECV completions.
+ * Returns the number of completed io_us or -1 on error.
+ * (cmpl_num_total is never negative here, so comparing it against the
+ * unsigned `min`/`max` parameters is safe.)
+ */
+int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
+		unsigned int max, const struct timespec *t)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	/* total # of completed io_us */
+	int cmpl_num_total = 0;
+	/* # of completed io_us from a single event */
+	int cmpl_num;
+
+	do {
+		cmpl_num = client_getevent_process(td);
+		if (cmpl_num > 0) {
+			/* new completions collected */
+			cmpl_num_total += cmpl_num;
+		} else if (cmpl_num == 0) {
+			/*
+			 * It is required to make sure that CQEs for SENDs
+			 * will flow at least at the same pace as CQEs for RECVs.
+			 */
+			if (cmpl_num_total >= min &&
+			    ccd->op_send_completed >= ccd->op_recv_completed)
+				break;
+
+			/*
+			 * To reduce CPU consumption one can use
+			 * the rpma_conn_completion_wait() function.
+			 * Note this greatly increase the latency
+			 * and make the results less stable.
+			 * The bandwidth stays more or less the same.
+			 */
+		} else {
+			/* an error occurred */
+			return -1;
+		}
+
+		/*
+		 * The expected max can be exceeded if CQEs for RECVs will come up
+		 * faster than CQEs for SENDs. But it is required to make sure CQEs for
+		 * SENDs will flow at least at the same pace as CQEs for RECVs.
+		 */
+	} while (cmpl_num_total < max ||
+			ccd->op_send_completed < ccd->op_recv_completed);
+
+	/*
+	 * All posted SENDs are completed and RECVs for them (responses) are
+	 * completed. This is the initial situation so the counters are reset.
+	 */
+	if (ccd->op_send_posted == ccd->op_send_completed &&
+	    ccd->op_send_completed == ccd->op_recv_completed) {
+		ccd->op_send_posted = 0;
+		ccd->op_send_completed = 0;
+		ccd->op_recv_completed = 0;
+	}
+
+	return cmpl_num_total;
+}
+
+/* hand the oldest completed io_u back to FIO (FIFO order) */
+struct io_u *librpma_fio_client_event(struct thread_data *td, int event)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct io_u *io_u = ccd->io_us_completed[0];
+
+	/* shift the remaining io_us to the front of the queue */
+	ccd->io_u_completed_nr--;
+	memmove(&ccd->io_us_completed[0], &ccd->io_us_completed[1],
+		ccd->io_u_completed_nr * sizeof(ccd->io_us_completed[0]));
+
+	dprint_io_u(io_u, "client_event");
+
+	return io_u;
+}
+
+char *librpma_fio_client_errdetails(struct io_u *io_u)
+{
+ /* get the string representation of an error */
+ enum ibv_wc_status status = io_u->error;
+ const char *status_str = ibv_wc_status_str(status);
+
+ char *details = strdup(status_str);
+ if (details == NULL) {
+ fprintf(stderr, "Error: %s\n", status_str);
+ fprintf(stderr, "Fatal error: out of memory. Aborting.\n");
+ abort();
+ }
+
+ /* FIO frees the returned string when it becomes obsolete */
+ return details;
+}
+
+/*
+ * Common server-side initialization: obtains the IBV context for the
+ * local listening IP and creates the server's peer object.
+ * Returns 0 on success, -1 on error.
+ */
+int librpma_fio_server_init(struct thread_data *td)
+{
+	struct librpma_fio_options_values *o = td->eo;
+	struct librpma_fio_server_data *csd;
+	struct ibv_context *dev = NULL;
+	enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
+	int ret;
+
+	/* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
+#ifdef FIO_INC_DEBUG
+	if (fio_debug & (1UL << FD_NET))
+		log_level_aux = RPMA_LOG_LEVEL_INFO;
+#endif
+
+	/* configure logging thresholds to see more details */
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);
+
+	/* obtain an IBV context for the local IP address */
+	ret = rpma_utils_get_ibv_context(o->server_ip,
+			RPMA_UTIL_IBV_CONTEXT_LOCAL, &dev);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
+		return -1;
+	}
+
+	/* allocate the server's data */
+	csd = calloc(1, sizeof(*csd));
+	if (csd == NULL) {
+		td_verror(td, errno, "calloc");
+		return -1;
+	}
+
+	/* create a new peer object */
+	ret = rpma_peer_new(dev, &csd->peer);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_peer_new");
+		free(csd);
+		return -1;
+	}
+
+	td->io_ops_data = csd;
+
+	return 0;
+}
+
+/* free the server's peer object and its private data */
+void librpma_fio_server_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	int ret;
+
+	if (!csd)
+		return;
+
+	/* delete the peer */
+	ret = rpma_peer_delete(&csd->peer);
+	if (ret)
+		librpma_td_verror(td, ret, "rpma_peer_delete");
+
+	free(csd);
+}
+
+/*
+ * Common server-side open: starts a listening endpoint, allocates and
+ * registers the workspace memory (DRAM for "malloc", PMem otherwise),
+ * accepts a single incoming connection and hands the workspace description
+ * to the client via the connection's private data.
+ * Returns 0 on success, -1 on error.
+ */
+int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
+		struct rpma_conn_cfg *cfg)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct librpma_fio_options_values *o = td->eo;
+	enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
+	struct librpma_fio_workspace ws = {0};
+	struct rpma_conn_private_data pdata;
+	uint32_t max_msg_num;
+	struct rpma_conn_req *conn_req;
+	struct rpma_conn *conn;
+	struct rpma_mr_local *mr;
+	char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
+	struct rpma_ep *ep;
+	size_t mem_size = td->o.size;
+	size_t mr_desc_size;
+	void *ws_ptr;
+	int usage_mem_type;
+	int ret;
+
+	if (!f->file_name) {
+		log_err("fio: filename is not set\n");
+		return -1;
+	}
+
+	/* start a listening endpoint at addr:port */
+	if (librpma_fio_td_port(o->port, td, port_td))
+		return -1;
+
+	if ((ret = rpma_ep_listen(csd->peer, o->server_ip, port_td, &ep))) {
+		librpma_td_verror(td, ret, "rpma_ep_listen");
+		return -1;
+	}
+
+	if (strcmp(f->file_name, "malloc") == 0) {
+		/* allocation from DRAM using posix_memalign() */
+		ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem);
+		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;
+	} else {
+		/* allocation from PMEM using pmem_map_file() */
+		ws_ptr = librpma_fio_allocate_pmem(td, f->file_name,
+				mem_size, &csd->mem);
+		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
+	}
+
+	if (ws_ptr == NULL)
+		goto err_ep_shutdown;
+
+	f->real_file_size = mem_size;
+
+	if ((ret = rpma_mr_reg(csd->peer, ws_ptr, mem_size,
+			RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
+			RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
+			usage_mem_type, &mr))) {
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+		goto err_free;
+	}
+
+	/* get size of the memory region's descriptor */
+	if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) {
+		librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size");
+		goto err_mr_dereg;
+	}
+
+	/* verify size of the memory region's descriptor */
+	if (mr_desc_size > LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE) {
+		log_err(
+			"size of the memory region's descriptor is too big (max=%i)\n",
+			LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE);
+		goto err_mr_dereg;
+	}
+
+	/* get the memory region's descriptor */
+	if ((ret = rpma_mr_get_descriptor(mr, &ws.descriptor[0]))) {
+		librpma_td_verror(td, ret, "rpma_mr_get_descriptor");
+		goto err_mr_dereg;
+	}
+
+	if (cfg != NULL) {
+		if ((ret = rpma_conn_cfg_get_rq_size(cfg, &max_msg_num))) {
+			librpma_td_verror(td, ret, "rpma_conn_cfg_get_rq_size");
+			goto err_mr_dereg;
+		}
+
+		/* verify whether iodepth fits into uint16_t */
+		if (max_msg_num > UINT16_MAX) {
+			log_err("fio: iodepth too big (%u > %u)\n",
+				max_msg_num, UINT16_MAX);
+			/*
+			 * Go through the full error path (previously this
+			 * returned directly and leaked the memory
+			 * registration, the workspace memory and the
+			 * listening endpoint).
+			 */
+			goto err_mr_dereg;
+		}
+
+		ws.max_msg_num = max_msg_num;
+	}
+
+	/* prepare a workspace description */
+	ws.direct_write_to_pmem = o->direct_write_to_pmem;
+	ws.mr_desc_size = mr_desc_size;
+	pdata.ptr = &ws;
+	pdata.len = sizeof(ws);
+
+	/* receive an incoming connection request */
+	if ((ret = rpma_ep_next_conn_req(ep, cfg, &conn_req))) {
+		librpma_td_verror(td, ret, "rpma_ep_next_conn_req");
+		goto err_mr_dereg;
+	}
+
+	if (csd->prepare_connection && csd->prepare_connection(td, conn_req))
+		goto err_req_delete;
+
+	/* accept the connection request and obtain the connection object */
+	if ((ret = rpma_conn_req_connect(&conn_req, &pdata, &conn))) {
+		librpma_td_verror(td, ret, "rpma_conn_req_connect");
+		goto err_req_delete;
+	}
+
+	/* wait for the connection to be established */
+	if ((ret = rpma_conn_next_event(conn, &conn_event))) {
+		librpma_td_verror(td, ret, "rpma_conn_next_event");
+		goto err_conn_delete;
+	} else if (conn_event != RPMA_CONN_ESTABLISHED) {
+		log_err("rpma_conn_next_event returned an unexptected event\n");
+		goto err_conn_delete;
+	}
+
+	/* end-point is no longer needed */
+	(void) rpma_ep_shutdown(&ep);
+
+	csd->ws_mr = mr;
+	csd->ws_ptr = ws_ptr;
+	csd->conn = conn;
+
+	return 0;
+
+err_conn_delete:
+	(void) rpma_conn_delete(&conn);
+
+err_req_delete:
+	(void) rpma_conn_req_delete(&conn_req);
+
+err_mr_dereg:
+	(void) rpma_mr_dereg(&mr);
+
+err_free:
+	librpma_fio_free(&csd->mem);
+
+err_ep_shutdown:
+	(void) rpma_ep_shutdown(&ep);
+
+	return -1;
+}
+
+/*
+ * Tear down the established connection: wait for the client to close it,
+ * disconnect, delete the connection, deregister the workspace memory and
+ * free it. Returns 0 on success, -1 when any of the steps failed.
+ */
+int librpma_fio_server_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
+	int ret;
+	int rv = 0;
+
+	/* wait for the connection to be closed by the client */
+	ret = rpma_conn_next_event(csd->conn, &conn_event);
+	if (ret == 0 && conn_event != RPMA_CONN_CLOSED) {
+		log_err("rpma_conn_next_event returned an unexptected event\n");
+		rv = -1;
+	}
+
+	ret = rpma_conn_disconnect(csd->conn);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_conn_disconnect");
+		rv = -1;
+	}
+
+	ret = rpma_conn_delete(&csd->conn);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_conn_delete");
+		rv = -1;
+	}
+
+	ret = rpma_mr_dereg(&csd->ws_mr);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_mr_dereg");
+		rv = -1;
+	}
+
+	librpma_fio_free(&csd->mem);
+
+	return rv;
+}
diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h
new file mode 100644
index 00000000..fb89d99d
--- /dev/null
+++ b/engines/librpma_fio.h
@@ -0,0 +1,275 @@
+/*
+ * librpma_fio: librpma_apm and librpma_gpspm engines' common header.
+ *
+ * Copyright 2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef LIBRPMA_FIO_H
+#define LIBRPMA_FIO_H 1
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+#include <librpma.h>
+
+/* servers' and clients' common */
+
+#define librpma_td_verror(td, err, func) \
+ td_vmsg((td), (err), rpma_err_2str(err), (func))
+
+/* ceil(a / b) = (a + b - 1) / b */
+#define LIBRPMA_FIO_CEIL(a, b) (((a) + (b) - 1) / (b))
+
+/* common option structure for server and client */
+struct librpma_fio_options_values {
+ /*
+ * FIO considers .off1 == 0 absent so the first meaningful field has to
+ * have padding ahead of it.
+ */
+ void *pad;
+ char *server_ip;
+ /* base server listening port */
+ char *port;
+ /* Direct Write to PMem is possible */
+ unsigned int direct_write_to_pmem;
+ /* Set to 0 to wait for completion instead of busy-wait polling completion. */
+ unsigned int busy_wait_polling;
+};
+
+extern struct fio_option librpma_fio_options[];
+
+/*
+ * Limited by the maximum length of the private data
+ * for rdma_connect() in case of RDMA_PS_TCP (28 bytes).
+ */
+#define LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE 24
+
+struct librpma_fio_workspace {
+ uint16_t max_msg_num; /* # of RQ slots */
+ uint8_t direct_write_to_pmem; /* Direct Write to PMem is possible */
+ uint8_t mr_desc_size; /* size of mr_desc in descriptor[] */
+ /* buffer containing mr_desc */
+ char descriptor[LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE];
+};
+
+#define LIBRPMA_FIO_PORT_STR_LEN_MAX 12
+
+int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
+ char *port_out);
+
+struct librpma_fio_mem {
+ /* memory buffer */
+ char *mem_ptr;
+
+ /* size of the mapped persistent memory */
+ size_t size_mmap;
+};
+
+char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
+ struct librpma_fio_mem *mem);
+
+char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+ size_t size, struct librpma_fio_mem *mem);
+
+void librpma_fio_free(struct librpma_fio_mem *mem);
+
+/* clients' common */
+
+typedef int (*librpma_fio_flush_t)(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len);
+
+/*
+ * RETURN VALUE
+ * - ( 1) - on success
+ * - ( 0) - skip
+ * - (-1) - on error
+ */
+typedef int (*librpma_fio_get_io_u_index_t)(struct rpma_completion *cmpl,
+ unsigned int *io_u_index);
+
+struct librpma_fio_client_data {
+ struct rpma_peer *peer;
+ struct rpma_conn *conn;
+
+ /* aligned td->orig_buffer */
+ char *orig_buffer_aligned;
+
+ /* ious's base address memory registration (cd->orig_buffer_aligned) */
+ struct rpma_mr_local *orig_mr;
+
+ struct librpma_fio_workspace *ws;
+
+ /* a server's memory representation */
+ struct rpma_mr_remote *server_mr;
+ enum rpma_flush_type server_mr_flush_type;
+
+ /* remote workspace description */
+ size_t ws_size;
+
+ /* in-memory queues */
+ struct io_u **io_us_queued;
+ int io_u_queued_nr;
+ struct io_u **io_us_flight;
+ int io_u_flight_nr;
+ struct io_u **io_us_completed;
+ int io_u_completed_nr;
+
+ /* SQ control. Note: all of them have to be kept in sync. */
+ uint32_t op_send_posted;
+ uint32_t op_send_completed;
+ uint32_t op_recv_completed;
+
+ librpma_fio_flush_t flush;
+ librpma_fio_get_io_u_index_t get_io_u_index;
+
+ /* engine-specific client data */
+ void *client_data;
+};
+
+int librpma_fio_client_init(struct thread_data *td,
+ struct rpma_conn_cfg *cfg);
+void librpma_fio_client_cleanup(struct thread_data *td);
+
+int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f);
+int librpma_fio_client_get_file_size(struct thread_data *td,
+ struct fio_file *f);
+
+int librpma_fio_client_post_init(struct thread_data *td);
+
+enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
+ struct io_u *io_u);
+
+int librpma_fio_client_commit(struct thread_data *td);
+
+int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t);
+
+struct io_u *librpma_fio_client_event(struct thread_data *td, int event);
+
+char *librpma_fio_client_errdetails(struct io_u *io_u);
+
+/*
+ * Post an RDMA READ from the server's workspace into the io_u's buffer.
+ * The io_u's index is attached as the operation context so the
+ * completion can be matched back to the io_u.
+ *
+ * Returns 0 when the READ was posted successfully, -1 otherwise.
+ */
+static inline int librpma_fio_client_io_read(struct thread_data *td,
+		struct io_u *io_u, int flags)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	size_t local_offset = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned;
+	size_t remote_offset = io_u->offset;
+	int ret;
+
+	ret = rpma_read(ccd->conn, ccd->orig_mr, local_offset,
+			ccd->server_mr, remote_offset, io_u->xfer_buflen,
+			flags, (void *)(uintptr_t)io_u->index);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_read");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Post an RDMA WRITE from the io_u's buffer into the server's
+ * workspace. A completion is requested only on error
+ * (RPMA_F_COMPLETION_ON_ERROR); the io_u's index is the context.
+ *
+ * Returns 0 when the WRITE was posted successfully, -1 otherwise.
+ */
+static inline int librpma_fio_client_io_write(struct thread_data *td,
+		struct io_u *io_u)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	size_t local_offset = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned;
+	size_t remote_offset = io_u->offset;
+	int ret;
+
+	ret = rpma_write(ccd->conn, ccd->server_mr, remote_offset,
+			ccd->orig_mr, local_offset, io_u->xfer_buflen,
+			RPMA_F_COMPLETION_ON_ERROR,
+			(void *)(uintptr_t)io_u->index);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_write");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Busy-poll the CQ until all posted SENDs are completed.
+ *
+ * When, afterwards, all SENDs and their RECV responses are completed,
+ * the three SQ-control counters are reset to the initial state.
+ *
+ * Returns -1 on a failed completion status or on a completion of an
+ * unexpected type; returns 0 otherwise. Note a rpma_conn_completion_get()
+ * error only breaks the drain loop and still returns 0.
+ */
+static inline int librpma_fio_client_io_complete_all_sends(
+		struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct rpma_completion cmpl;
+	int ret;
+
+	while (ccd->op_send_posted != ccd->op_send_completed) {
+		/* get a completion */
+		ret = rpma_conn_completion_get(ccd->conn, &cmpl);
+		if (ret == RPMA_E_NO_COMPLETION) {
+			/* lack of completion is not an error */
+			continue;
+		} else if (ret != 0) {
+			/* an error occurred */
+			librpma_td_verror(td, ret, "rpma_conn_completion_get");
+			break;
+		}
+
+		if (cmpl.op_status != IBV_WC_SUCCESS)
+			return -1;
+
+		if (cmpl.op == RPMA_OP_SEND)
+			++ccd->op_send_completed;
+		else {
+			log_err(
+				"A completion other than RPMA_OP_SEND got during cleaning up the CQ from SENDs\n");
+			return -1;
+		}
+	}
+
+	/*
+	 * All posted SENDs are completed and RECVs for them (responses) are
+	 * completed. This is the initial situation so the counters are reset.
+	 */
+	if (ccd->op_send_posted == ccd->op_send_completed &&
+			ccd->op_send_completed == ccd->op_recv_completed) {
+		ccd->op_send_posted = 0;
+		ccd->op_send_completed = 0;
+		ccd->op_recv_completed = 0;
+	}
+
+	return 0;
+}
+
+/* servers' common */
+
+typedef int (*librpma_fio_prepare_connection_t)(
+ struct thread_data *td,
+ struct rpma_conn_req *conn_req);
+
+struct librpma_fio_server_data {
+ struct rpma_peer *peer;
+
+ /* resources of an incoming connection */
+ struct rpma_conn *conn;
+
+ char *ws_ptr;
+ struct rpma_mr_local *ws_mr;
+ struct librpma_fio_mem mem;
+
+ /* engine-specific server data */
+ void *server_data;
+
+ librpma_fio_prepare_connection_t prepare_connection;
+};
+
+int librpma_fio_server_init(struct thread_data *td);
+
+void librpma_fio_server_cleanup(struct thread_data *td);
+
+int librpma_fio_server_open_file(struct thread_data *td,
+ struct fio_file *f, struct rpma_conn_cfg *cfg);
+
+int librpma_fio_server_close_file(struct thread_data *td,
+ struct fio_file *f);
+
+#endif /* LIBRPMA_FIO_H */
diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c
new file mode 100644
index 00000000..74147709
--- /dev/null
+++ b/engines/librpma_gpspm.c
@@ -0,0 +1,776 @@
+/*
+ * librpma_gpspm: IO engine that uses PMDK librpma to write data,
+ * based on General Purpose Server Persistency Method
+ *
+ * Copyright 2020-2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+#include <libpmem.h>
+
+/* Generated by the protocol buffer compiler from: librpma_gpspm_flush.proto */
+#include "librpma_gpspm_flush.pb-c.h"
+
+#define MAX_MSG_SIZE (512)
+#define IO_U_BUF_LEN (2 * MAX_MSG_SIZE)
+#define SEND_OFFSET (0)
+#define RECV_OFFSET (SEND_OFFSET + MAX_MSG_SIZE)
+
+#define GPSPM_FLUSH_REQUEST__LAST \
+ { PROTOBUF_C_MESSAGE_INIT(&gpspm_flush_request__descriptor), 0, 0, 0 }
+
+/*
+ * 'Flush_req_last' is the last flush request
+ * the client has to send to server to indicate
+ * that the client is done.
+ */
+static const GPSPMFlushRequest Flush_req_last = GPSPM_FLUSH_REQUEST__LAST;
+
+#define IS_NOT_THE_LAST_MESSAGE(flush_req) \
+ (flush_req->length != Flush_req_last.length || \
+ flush_req->offset != Flush_req_last.offset)
+
+/* client side implementation */
+
+/* get next io_u message buffer in the round-robin fashion */
+#define IO_U_NEXT_BUF_OFF_CLIENT(cd) \
+ (IO_U_BUF_LEN * ((cd->msg_curr++) % cd->msg_num))
+
+/* per-thread client state private to the GPSPM engine */
+struct client_data {
+	/* buffers for sending and receiving the flush messages */
+	char *io_us_msgs;
+
+	/* resources for messaging buffer */
+	uint32_t msg_num;	/* total # of SEND/RECV buffer pairs */
+	uint32_t msg_curr;	/* monotonic counter for round-robin buffer selection */
+	struct rpma_mr_local *msg_mr;	/* registration of io_us_msgs */
+};
+
+static inline int client_io_flush(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len);
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+ unsigned int *io_u_index);
+
+/*
+ * Client-side init of the GPSPM engine: rejects unsupported workload
+ * modes, allocates per-client data, sizes the connection's SQ/RQ/CQ
+ * for the expected number of WRITEs and flush request/response pairs
+ * and delegates to the common librpma client init.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int client_init(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd;
+	struct client_data *cd;
+	uint32_t write_num;
+	struct rpma_conn_cfg *cfg = NULL;
+	int ret;
+
+	/*
+	 * not supported:
+	 * - readwrite = read / trim / randread / randtrim /
+	 *   / rw / randrw / trimwrite
+	 */
+	if (td_read(td) || td_trim(td)) {
+		td_verror(td, EINVAL, "Not supported mode.");
+		return -1;
+	}
+
+	/* allocate client's data */
+	cd = calloc(1, sizeof(*cd));
+	if (cd == NULL) {
+		td_verror(td, errno, "calloc");
+		return -1;
+	}
+
+	/*
+	 * Calculate the required number of WRITEs and FLUSHes.
+	 *
+	 * Note: Each flush is a request (SEND) and response (RECV) pair.
+	 */
+	if (td_random(td)) {
+		write_num = td->o.iodepth; /* WRITE * N */
+		cd->msg_num = td->o.iodepth; /* FLUSH * N */
+	} else {
+		if (td->o.sync_io) {
+			write_num = 1; /* WRITE */
+			cd->msg_num = 1; /* FLUSH */
+		} else {
+			write_num = td->o.iodepth; /* WRITE * N */
+			/*
+			 * FLUSH * B where:
+			 * - B == ceil(iodepth / iodepth_batch)
+			 *   which is the number of batches for N writes
+			 */
+			cd->msg_num = LIBRPMA_FIO_CEIL(td->o.iodepth,
+					td->o.iodepth_batch);
+		}
+	}
+
+	/* create a connection configuration object */
+	if ((ret = rpma_conn_cfg_new(&cfg))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+		goto err_free_cd;
+	}
+
+	/*
+	 * Calculate the required queue sizes where:
+	 * - the send queue (SQ) has to be big enough to accommodate
+	 *   all io_us (WRITEs) and all flush requests (SENDs)
+	 * - the receive queue (RQ) has to be big enough to accommodate
+	 *   all flush responses (RECVs)
+	 * - the completion queue (CQ) has to be big enough to accommodate all
+	 *   success and error completions (sq_size + rq_size)
+	 */
+	if ((ret = rpma_conn_cfg_set_sq_size(cfg, write_num + cd->msg_num))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+		goto err_cfg_delete;
+	}
+	if ((ret = rpma_conn_cfg_set_rq_size(cfg, cd->msg_num))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size");
+		goto err_cfg_delete;
+	}
+	if ((ret = rpma_conn_cfg_set_cq_size(cfg, write_num + cd->msg_num * 2))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+		goto err_cfg_delete;
+	}
+
+	if (librpma_fio_client_init(td, cfg))
+		goto err_cfg_delete;
+
+	ccd = td->io_ops_data;
+
+	/* only the first thread prints the hint to avoid spamming the log */
+	if (ccd->ws->direct_write_to_pmem &&
+	    ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT &&
+	    td->thread_number == 1) {
+		/* XXX log_info mixes with the JSON output */
+		log_err(
+			"Note: The server side supports Direct Write to PMem and it is equipped with PMem (direct_write_to_pmem).\n"
+			"You can use librpma_client and librpma_server engines for better performance instead of GPSPM.\n");
+	}
+
+	/* validate the server's RQ capacity */
+	if (cd->msg_num > ccd->ws->max_msg_num) {
+		log_err(
+			"server's RQ size (iodepth) too small to handle the client's workspace requirements (%u < %u)\n",
+			ccd->ws->max_msg_num, cd->msg_num);
+		goto err_cleanup_common;
+	}
+
+	if ((ret = rpma_conn_cfg_delete(&cfg))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_delete");
+		/* non fatal error - continue */
+	}
+
+	/* hook the GPSPM-specific callbacks into the common client data */
+	ccd->flush = client_io_flush;
+	ccd->get_io_u_index = client_get_io_u_index;
+	ccd->client_data = cd;
+
+	return 0;
+
+err_cleanup_common:
+	librpma_fio_client_cleanup(td);
+
+err_cfg_delete:
+	(void) rpma_conn_cfg_delete(&cfg);
+
+err_free_cd:
+	free(cd);
+
+	return -1;
+}
+
+/*
+ * Allocate and register the messaging buffer used for the flush
+ * requests (SENDs) and responses (RECVs), then run the common
+ * librpma client post-init.
+ *
+ * Returns 0 on success or a non-zero error code.
+ */
+static int client_post_init(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct client_data *cd = ccd->client_data;
+	unsigned int io_us_msgs_size;
+	int ret;
+
+	/* message buffers initialization and registration */
+	io_us_msgs_size = cd->msg_num * IO_U_BUF_LEN;
+	if ((ret = posix_memalign((void **)&cd->io_us_msgs, page_size,
+			io_us_msgs_size))) {
+		td_verror(td, ret, "posix_memalign");
+		return ret;
+	}
+	if ((ret = rpma_mr_reg(ccd->peer, cd->io_us_msgs, io_us_msgs_size,
+			RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV,
+			&cd->msg_mr))) {
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+		/* do not leak the messaging buffer on the error path */
+		free(cd->io_us_msgs);
+		cd->io_us_msgs = NULL;
+		return ret;
+	}
+
+	return librpma_fio_client_post_init(td);
+}
+
+/*
+ * Client-side cleanup: sends the termination notice (the "last flush"
+ * message) to the server, waits for its SEND completion, releases the
+ * messaging buffer and runs the common client cleanup.
+ */
+static void client_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct client_data *cd;
+	size_t flush_req_size;
+	size_t io_u_buf_off;
+	size_t send_offset;
+	void *send_ptr;
+	int ret;
+
+	if (ccd == NULL)
+		return;
+
+	cd = ccd->client_data;
+	if (cd == NULL) {
+		librpma_fio_client_cleanup(td);
+		return;
+	}
+
+	/*
+	 * Make sure all SEND completions are collected ergo there are free
+	 * slots in the SQ for the last SEND message.
+	 *
+	 * Note: If any operation will fail we still can send the termination
+	 * notice.
+	 */
+	(void) librpma_fio_client_io_complete_all_sends(td);
+
+	/* prepare the last flush message and pack it to the send buffer */
+	flush_req_size = gpspm_flush_request__get_packed_size(&Flush_req_last);
+	if (flush_req_size > MAX_MSG_SIZE) {
+		log_err(
+			"Packed flush request size is bigger than available send buffer space (%zu > %d)\n",
+			flush_req_size, MAX_MSG_SIZE);
+	} else {
+		io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd);
+		send_offset = io_u_buf_off + SEND_OFFSET;
+		send_ptr = cd->io_us_msgs + send_offset;
+		(void) gpspm_flush_request__pack(&Flush_req_last, send_ptr);
+
+		/* send the flush message */
+		if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset,
+				flush_req_size, RPMA_F_COMPLETION_ALWAYS,
+				NULL))) {
+			librpma_td_verror(td, ret, "rpma_send");
+		} else {
+			/*
+			 * Count the SEND as posted only when it succeeded;
+			 * otherwise waiting for its completion could spin
+			 * forever on a completion that will never arrive.
+			 */
+			++ccd->op_send_posted;
+
+			/* wait for the SEND to complete */
+			(void) librpma_fio_client_io_complete_all_sends(td);
+		}
+	}
+
+	/* deregister the messaging buffer memory */
+	if ((ret = rpma_mr_dereg(&cd->msg_mr)))
+		librpma_td_verror(td, ret, "rpma_mr_dereg");
+
+	/* free the messaging buffer allocated in client_post_init() */
+	free(cd->io_us_msgs);
+
+	free(ccd->client_data);
+
+	librpma_fio_client_cleanup(td);
+}
+
+/*
+ * Post a flush for the [first_io_u; last_io_u] range: prepare a RECV
+ * buffer for the response and SEND the packed GPSPM flush request to
+ * the server. The last io_u's index travels in the request's
+ * op_context so the response can be matched back to the io_u.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static inline int client_io_flush(struct thread_data *td,
+		struct io_u *first_io_u, struct io_u *last_io_u,
+		unsigned long long int len)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct client_data *cd = ccd->client_data;
+	size_t io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd);
+	size_t send_offset = io_u_buf_off + SEND_OFFSET;
+	size_t recv_offset = io_u_buf_off + RECV_OFFSET;
+	void *send_ptr = cd->io_us_msgs + send_offset;
+	void *recv_ptr = cd->io_us_msgs + recv_offset;
+	GPSPMFlushRequest flush_req = GPSPM_FLUSH_REQUEST__INIT;
+	size_t flush_req_size = 0;
+	int ret;
+
+	/* prepare a response buffer */
+	if ((ret = rpma_recv(ccd->conn, cd->msg_mr, recv_offset, MAX_MSG_SIZE,
+			recv_ptr))) {
+		librpma_td_verror(td, ret, "rpma_recv");
+		return -1;
+	}
+
+	/* prepare a flush message and pack it to a send buffer */
+	flush_req.offset = first_io_u->offset;
+	flush_req.length = len;
+	flush_req.op_context = last_io_u->index;
+	flush_req_size = gpspm_flush_request__get_packed_size(&flush_req);
+	if (flush_req_size > MAX_MSG_SIZE) {
+		/* %zu matches the size_t argument (PRIu64 did not) */
+		log_err(
+			"Packed flush request size is bigger than available send buffer space (%zu > %d)\n",
+			flush_req_size, MAX_MSG_SIZE);
+		return -1;
+	}
+	(void) gpspm_flush_request__pack(&flush_req, send_ptr);
+
+	/* send the flush message */
+	if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset, flush_req_size,
+			RPMA_F_COMPLETION_ALWAYS, NULL))) {
+		librpma_td_verror(td, ret, "rpma_send");
+		return -1;
+	}
+
+	++ccd->op_send_posted;
+
+	return 0;
+}
+
+/*
+ * Extract the io_u index carried in a flush-response RECV completion.
+ *
+ * RETURN VALUE
+ * - ( 1) - io_u_index extracted from the unpacked flush response
+ * - ( 0) - not a RECV completion - skip
+ * - (-1) - the response buffer could not be unpacked
+ */
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+		unsigned int *io_u_index)
+{
+	GPSPMFlushResponse *flush_resp;
+
+	if (cmpl->op != RPMA_OP_RECV)
+		return 0;
+
+	/* unpack a response from the received buffer */
+	flush_resp = gpspm_flush_response__unpack(NULL,
+			cmpl->byte_len, cmpl->op_context);
+	if (flush_resp == NULL) {
+		log_err("Cannot unpack the flush response buffer\n");
+		return -1;
+	}
+
+	/* only the low bits of the 64-bit op_context hold the index */
+	memcpy(io_u_index, &flush_resp->op_context, sizeof(*io_u_index));
+
+	gpspm_flush_response__free_unpacked(flush_resp, NULL);
+
+	return 1;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_client = {
+ .name = "librpma_gpspm_client",
+ .version = FIO_IOOPS_VERSION,
+ .init = client_init,
+ .post_init = client_post_init,
+ .get_file_size = librpma_fio_client_get_file_size,
+ .open_file = librpma_fio_file_nop,
+ .queue = librpma_fio_client_queue,
+ .commit = librpma_fio_client_commit,
+ .getevents = librpma_fio_client_getevents,
+ .event = librpma_fio_client_event,
+ .errdetails = librpma_fio_client_errdetails,
+ .close_file = librpma_fio_file_nop,
+ .cleanup = client_cleanup,
+ .flags = FIO_DISKLESSIO,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* server side implementation */
+
+#define IO_U_BUFF_OFF_SERVER(i) (i * IO_U_BUF_LEN)
+
+/* per-thread server state private to the GPSPM engine */
+struct server_data {
+	/* aligned td->orig_buffer */
+	char *orig_buffer_aligned;
+
+	/* resources for messaging buffer from DRAM allocated by fio */
+	struct rpma_mr_local *msg_mr;
+
+	uint32_t msg_sqe_available; /* # of free SQ slots */
+
+	/* in-memory queues */
+	struct rpma_completion *msgs_queued;	/* RECVs awaiting a free SQ slot */
+	uint32_t msg_queued_nr;			/* # of valid entries in msgs_queued */
+};
+
+/*
+ * Server-side init of the GPSPM engine: runs the common librpma server
+ * init, allocates the per-thread server data and the in-memory RECV
+ * queue, and forces the io_u geometry required for the messaging
+ * buffers.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int server_init(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd;
+	struct server_data *sd;
+	int ret = -1;
+
+	if ((ret = librpma_fio_server_init(td)))
+		return ret;
+
+	csd = td->io_ops_data;
+
+	/* allocate server's data */
+	sd = calloc(1, sizeof(*sd));
+	if (sd == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_server_cleanup;
+	}
+
+	/* allocate in-memory queue */
+	sd->msgs_queued = calloc(td->o.iodepth, sizeof(*sd->msgs_queued));
+	if (sd->msgs_queued == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_sd;
+	}
+
+	/*
+	 * Assure a single io_u buffer can store both SEND and RECV messages and
+	 * an io_us buffer allocation is page-size-aligned which is required
+	 * to register for RDMA. User-provided values are intentionally ignored.
+	 */
+	td->o.max_bs[DDIR_READ] = IO_U_BUF_LEN;
+	td->o.mem_align = page_size;
+
+	csd->server_data = sd;
+
+	return 0;
+
+err_free_sd:
+	free(sd);
+
+err_server_cleanup:
+	librpma_fio_server_cleanup(td);
+
+	return -1;
+}
+
+/*
+ * Register the fio-allocated io_u area as the SEND/RECV messaging
+ * buffer. The buffer is page-aligned (see server_init()) and must be
+ * large enough to hold a request/response pair per io_u.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int server_post_init(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	size_t io_us_size;
+	size_t io_u_buflen;
+	int ret;
+
+	/*
+	 * td->orig_buffer is not aligned. The engine requires aligned io_us
+	 * so FIO aligns up the address using the formula below.
+	 */
+	sd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
+			td->o.mem_align;
+
+	/*
+	 * XXX
+	 * Each io_u message buffer contains recv and send messages.
+	 * Aligning each of those buffers may potentially give
+	 * some performance benefits.
+	 */
+	io_u_buflen = td_max_bs(td);
+
+	/* check whether io_u buffer is big enough */
+	if (io_u_buflen < IO_U_BUF_LEN) {
+		/* %zu matches the size_t argument (PRIu64 did not) */
+		log_err(
+			"blocksize too small to accommodate assumed maximal request/response pair size (%zu < %d)\n",
+			io_u_buflen, IO_U_BUF_LEN);
+		return -1;
+	}
+
+	/*
+	 * td->orig_buffer_size beside the space really consumed by io_us
+	 * has paddings which can be omitted for the memory registration.
+	 */
+	io_us_size = (unsigned long long)io_u_buflen *
+			(unsigned long long)td->o.iodepth;
+
+	if ((ret = rpma_mr_reg(csd->peer, sd->orig_buffer_aligned, io_us_size,
+			RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV,
+			&sd->msg_mr))) {
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Server-side cleanup: deregister the messaging buffer, free the
+ * per-thread server data and run the common server cleanup.
+ */
+static void server_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd;
+	int ret;
+
+	if (csd == NULL)
+		return;
+
+	sd = csd->server_data;
+	if (sd == NULL) {
+		librpma_fio_server_cleanup(td);
+		return;
+	}
+
+	/* rpma_mr_dereg(messaging buffer from DRAM) */
+	ret = rpma_mr_dereg(&sd->msg_mr);
+	if (ret)
+		librpma_td_verror(td, ret, "rpma_mr_dereg");
+
+	free(sd->msgs_queued);
+	free(sd);
+
+	librpma_fio_server_cleanup(td);
+}
+
+/*
+ * Pre-post one RECV buffer per expected flush request (td->o.iodepth)
+ * on the not-yet-established connection so no early SEND from the
+ * client is dropped. The buffer's index is attached as the RECV's
+ * operation context.
+ *
+ * Returns 0 on success or the rpma error code.
+ */
+static int prepare_connection(struct thread_data *td,
+		struct rpma_conn_req *conn_req)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	int ret;
+	int i;
+
+	/* prepare buffers for a flush requests */
+	sd->msg_sqe_available = td->o.iodepth;
+	for (i = 0; i < td->o.iodepth; i++) {
+		size_t offset_recv_msg = IO_U_BUFF_OFF_SERVER(i) + RECV_OFFSET;
+		if ((ret = rpma_conn_req_recv(conn_req, sd->msg_mr,
+				offset_recv_msg, MAX_MSG_SIZE,
+				(const void *)(uintptr_t)i))) {
+			librpma_td_verror(td, ret, "rpma_conn_req_recv");
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Open the server's side of the workspace: size the connection's
+ * queues for the messaging traffic, hook the RECV pre-posting callback
+ * and delegate to the common librpma server open.
+ *
+ * Returns the result of librpma_fio_server_open_file() or -1 if the
+ * connection configuration could not be built.
+ */
+static int server_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct rpma_conn_cfg *cfg = NULL;
+	uint16_t max_msg_num = td->o.iodepth;
+	int ret;
+
+	csd->prepare_connection = prepare_connection;
+
+	/* create a connection configuration object */
+	if ((ret = rpma_conn_cfg_new(&cfg))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+		return -1;
+	}
+
+	/*
+	 * Calculate the required queue sizes where:
+	 * - the send queue (SQ) has to be big enough to accommodate
+	 *   all possible flush requests (SENDs)
+	 * - the receive queue (RQ) has to be big enough to accommodate
+	 *   all flush responses (RECVs)
+	 * - the completion queue (CQ) has to be big enough to accommodate
+	 *   all success and error completions (sq_size + rq_size)
+	 */
+	if ((ret = rpma_conn_cfg_set_sq_size(cfg, max_msg_num))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+		goto err_cfg_delete;
+	}
+	if ((ret = rpma_conn_cfg_set_rq_size(cfg, max_msg_num))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size");
+		goto err_cfg_delete;
+	}
+	if ((ret = rpma_conn_cfg_set_cq_size(cfg, max_msg_num * 2))) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+		goto err_cfg_delete;
+	}
+
+	ret = librpma_fio_server_open_file(td, f, cfg);
+
+err_cfg_delete:
+	(void) rpma_conn_cfg_delete(&cfg);
+
+	return ret;
+}
+
+/*
+ * Handle a single received flush request:
+ * - persist the flushed range (or mark the thread done if this is the
+ *   client's termination notice),
+ * - re-post the RECV buffer for the next request,
+ * - pack and SEND the flush response back to the client.
+ *
+ * Returns 0 on success; on error sets td->terminate and returns -1.
+ */
+static int server_qe_process(struct thread_data *td,
+		struct rpma_completion *cmpl)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	GPSPMFlushRequest *flush_req;
+	GPSPMFlushResponse flush_resp = GPSPM_FLUSH_RESPONSE__INIT;
+	size_t flush_resp_size = 0;
+	size_t send_buff_offset;
+	size_t recv_buff_offset;
+	size_t io_u_buff_offset;
+	void *send_buff_ptr;
+	void *recv_buff_ptr;
+	void *op_ptr;
+	int msg_index;
+	int ret;
+
+	/* calculate SEND/RECV pair parameters */
+	msg_index = (int)(uintptr_t)cmpl->op_context;
+	io_u_buff_offset = IO_U_BUFF_OFF_SERVER(msg_index);
+	send_buff_offset = io_u_buff_offset + SEND_OFFSET;
+	recv_buff_offset = io_u_buff_offset + RECV_OFFSET;
+	send_buff_ptr = sd->orig_buffer_aligned + send_buff_offset;
+	recv_buff_ptr = sd->orig_buffer_aligned + recv_buff_offset;
+
+	/* unpack a flush request from the received buffer */
+	flush_req = gpspm_flush_request__unpack(NULL, cmpl->byte_len,
+			recv_buff_ptr);
+	if (flush_req == NULL) {
+		log_err("cannot unpack the flush request buffer\n");
+		goto err_terminate;
+	}
+
+	if (IS_NOT_THE_LAST_MESSAGE(flush_req)) {
+		op_ptr = csd->ws_ptr + flush_req->offset;
+		pmem_persist(op_ptr, flush_req->length);
+	} else {
+		/*
+		 * This is the last message - the client is done.
+		 */
+		gpspm_flush_request__free_unpacked(flush_req, NULL);
+		td->done = true;
+		return 0;
+	}
+
+	/* initiate the next receive operation */
+	if ((ret = rpma_recv(csd->conn, sd->msg_mr, recv_buff_offset,
+			MAX_MSG_SIZE,
+			(const void *)(uintptr_t)msg_index))) {
+		librpma_td_verror(td, ret, "rpma_recv");
+		goto err_free_unpacked;
+	}
+
+	/* prepare a flush response and pack it to a send buffer */
+	flush_resp.op_context = flush_req->op_context;
+	flush_resp_size = gpspm_flush_response__get_packed_size(&flush_resp);
+	if (flush_resp_size > MAX_MSG_SIZE) {
+		/* %zu matches the size_t argument (PRIu64 did not) */
+		log_err(
+			"Size of the packed flush response is bigger than the available space of the send buffer (%zu > %i)\n",
+			flush_resp_size, MAX_MSG_SIZE);
+		goto err_free_unpacked;
+	}
+
+	(void) gpspm_flush_response__pack(&flush_resp, send_buff_ptr);
+
+	/* send the flush response */
+	if ((ret = rpma_send(csd->conn, sd->msg_mr, send_buff_offset,
+			flush_resp_size, RPMA_F_COMPLETION_ALWAYS, NULL))) {
+		librpma_td_verror(td, ret, "rpma_send");
+		goto err_free_unpacked;
+	}
+	--sd->msg_sqe_available;
+
+	gpspm_flush_request__free_unpacked(flush_req, NULL);
+
+	return 0;
+
+err_free_unpacked:
+	gpspm_flush_request__free_unpacked(flush_req, NULL);
+
+err_terminate:
+	td->terminate = true;
+
+	return -1;
+}
+
+/*
+ * Process as many queued RECV completions as there are free SQ slots
+ * for the flush responses, then shift the remaining entries to the
+ * front of the queue.
+ *
+ * Returns 0 on success or the server_qe_process() error code.
+ */
+static inline int server_queue_process(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	int ret;
+	/* unsigned index avoids a signed/unsigned comparison below */
+	uint32_t i;
+
+	/* min(# of queue entries, # of SQ entries available) */
+	uint32_t qes_to_process = min(sd->msg_queued_nr, sd->msg_sqe_available);
+	if (qes_to_process == 0)
+		return 0;
+
+	/* process queued completions */
+	for (i = 0; i < qes_to_process; ++i) {
+		if ((ret = server_qe_process(td, &sd->msgs_queued[i])))
+			return ret;
+	}
+
+	/* progress the queue */
+	for (i = 0; i < sd->msg_queued_nr - qes_to_process; ++i) {
+		memcpy(&sd->msgs_queued[i],
+			&sd->msgs_queued[qes_to_process + i],
+			sizeof(sd->msgs_queued[i]));
+	}
+
+	sd->msg_queued_nr -= qes_to_process;
+
+	return 0;
+}
+
+/*
+ * Collect a single completion from the CQ into the in-memory queue.
+ *
+ * With busy_wait_polling disabled the call blocks in
+ * rpma_conn_completion_wait() until a completion is ready; otherwise
+ * it only polls once. RECV completions (flush requests) are queued for
+ * server_queue_process(); SEND completions return their SQ slot.
+ *
+ * Lack of a completion is not an error (returns 0). Any other failure
+ * (including a failed completion status) sets td->terminate and
+ * returns -1.
+ */
+static int server_cmpl_process(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	struct rpma_completion *cmpl = &sd->msgs_queued[sd->msg_queued_nr];
+	struct librpma_fio_options_values *o = td->eo;
+	int ret;
+
+	ret = rpma_conn_completion_get(csd->conn, cmpl);
+	if (ret == RPMA_E_NO_COMPLETION) {
+		if (o->busy_wait_polling == 0) {
+			ret = rpma_conn_completion_wait(csd->conn);
+			if (ret == RPMA_E_NO_COMPLETION) {
+				/* lack of completion is not an error */
+				return 0;
+			} else if (ret != 0) {
+				librpma_td_verror(td, ret, "rpma_conn_completion_wait");
+				goto err_terminate;
+			}
+
+			/* the wait returned - retry the (non-blocking) get */
+			ret = rpma_conn_completion_get(csd->conn, cmpl);
+			if (ret == RPMA_E_NO_COMPLETION) {
+				/* lack of completion is not an error */
+				return 0;
+			} else if (ret != 0) {
+				librpma_td_verror(td, ret, "rpma_conn_completion_get");
+				goto err_terminate;
+			}
+		} else {
+			/* lack of completion is not an error */
+			return 0;
+		}
+	} else if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_completion_get");
+		goto err_terminate;
+	}
+
+	/* validate the completion */
+	if (cmpl->op_status != IBV_WC_SUCCESS)
+		goto err_terminate;
+
+	if (cmpl->op == RPMA_OP_RECV)
+		++sd->msg_queued_nr;
+	else if (cmpl->op == RPMA_OP_SEND)
+		++sd->msg_sqe_available;
+
+	return 0;
+
+err_terminate:
+	td->terminate = true;
+
+	return -1;
+}
+
+/*
+ * The server's queue hook: loop collecting and processing flush
+ * requests until the client sends the termination notice (td->done)
+ * or an error makes a processing step fail (FIO_Q_BUSY).
+ */
+static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u)
+{
+	for (;;) {
+		if (server_cmpl_process(td))
+			return FIO_Q_BUSY;
+
+		if (server_queue_process(td))
+			return FIO_Q_BUSY;
+
+		if (td->done)
+			break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_server = {
+ .name = "librpma_gpspm_server",
+ .version = FIO_IOOPS_VERSION,
+ .init = server_init,
+ .post_init = server_post_init,
+ .open_file = server_open_file,
+ .close_file = librpma_fio_server_close_file,
+ .queue = server_queue,
+ .invalidate = librpma_fio_file_nop,
+ .cleanup = server_cleanup,
+ .flags = FIO_SYNCIO,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* register both engines */
+
+/* constructor: register the client and server engines on module load */
+static void fio_init fio_librpma_gpspm_register(void)
+{
+	register_ioengine(&ioengine_client);
+	register_ioengine(&ioengine_server);
+}
+
+/* destructor: unregister both engines on module unload */
+static void fio_exit fio_librpma_gpspm_unregister(void)
+{
+	unregister_ioengine(&ioengine_client);
+	unregister_ioengine(&ioengine_server);
+}
diff --git a/engines/librpma_gpspm_flush.pb-c.c b/engines/librpma_gpspm_flush.pb-c.c
new file mode 100644
index 00000000..3ff24756
--- /dev/null
+++ b/engines/librpma_gpspm_flush.pb-c.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2020, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Generated by the protocol buffer compiler. DO NOT EDIT! */
+/* Generated from: librpma_gpspm_flush.proto */
+
+/* Do not generate deprecated warnings for self */
+#ifndef PROTOBUF_C__NO_DEPRECATED
+#define PROTOBUF_C__NO_DEPRECATED
+#endif
+
+#include "librpma_gpspm_flush.pb-c.h"
+void gpspm_flush_request__init
+ (GPSPMFlushRequest *message)
+{
+ static const GPSPMFlushRequest init_value = GPSPM_FLUSH_REQUEST__INIT;
+ *message = init_value;
+}
+size_t gpspm_flush_request__get_packed_size
+ (const GPSPMFlushRequest *message)
+{
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t gpspm_flush_request__pack
+ (const GPSPMFlushRequest *message,
+ uint8_t *out)
+{
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t gpspm_flush_request__pack_to_buffer
+ (const GPSPMFlushRequest *message,
+ ProtobufCBuffer *buffer)
+{
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+GPSPMFlushRequest *
+ gpspm_flush_request__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data)
+{
+ return (GPSPMFlushRequest *)
+ protobuf_c_message_unpack (&gpspm_flush_request__descriptor,
+ allocator, len, data);
+}
+void gpspm_flush_request__free_unpacked
+ (GPSPMFlushRequest *message,
+ ProtobufCAllocator *allocator)
+{
+ if(!message)
+ return;
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void gpspm_flush_response__init
+ (GPSPMFlushResponse *message)
+{
+ static const GPSPMFlushResponse init_value = GPSPM_FLUSH_RESPONSE__INIT;
+ *message = init_value;
+}
+size_t gpspm_flush_response__get_packed_size
+ (const GPSPMFlushResponse *message)
+{
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t gpspm_flush_response__pack
+ (const GPSPMFlushResponse *message,
+ uint8_t *out)
+{
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t gpspm_flush_response__pack_to_buffer
+ (const GPSPMFlushResponse *message,
+ ProtobufCBuffer *buffer)
+{
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+GPSPMFlushResponse *
+ gpspm_flush_response__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data)
+{
+ return (GPSPMFlushResponse *)
+ protobuf_c_message_unpack (&gpspm_flush_response__descriptor,
+ allocator, len, data);
+}
+void gpspm_flush_response__free_unpacked
+ (GPSPMFlushResponse *message,
+ ProtobufCAllocator *allocator)
+{
+ if(!message)
+ return;
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+static const ProtobufCFieldDescriptor gpspm_flush_request__field_descriptors[3] =
+{
+ {
+ "offset",
+ 1,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushRequest, offset),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+ {
+ "length",
+ 2,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushRequest, length),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+ {
+ "op_context",
+ 3,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushRequest, op_context),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+};
+static const unsigned gpspm_flush_request__field_indices_by_name[] = {
+ 1, /* field[1] = length */
+ 0, /* field[0] = offset */
+ 2, /* field[2] = op_context */
+};
+static const ProtobufCIntRange gpspm_flush_request__number_ranges[1 + 1] =
+{
+ { 1, 0 },
+ { 0, 3 }
+};
+const ProtobufCMessageDescriptor gpspm_flush_request__descriptor =
+{
+ PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+ "GPSPM_flush_request",
+ "GPSPMFlushRequest",
+ "GPSPMFlushRequest",
+ "",
+ sizeof(GPSPMFlushRequest),
+ 3,
+ gpspm_flush_request__field_descriptors,
+ gpspm_flush_request__field_indices_by_name,
+ 1, gpspm_flush_request__number_ranges,
+ (ProtobufCMessageInit) gpspm_flush_request__init,
+ NULL,NULL,NULL /* reserved[123] */
+};
+static const ProtobufCFieldDescriptor gpspm_flush_response__field_descriptors[1] =
+{
+ {
+ "op_context",
+ 1,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushResponse, op_context),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+};
+static const unsigned gpspm_flush_response__field_indices_by_name[] = {
+ 0, /* field[0] = op_context */
+};
+static const ProtobufCIntRange gpspm_flush_response__number_ranges[1 + 1] =
+{
+ { 1, 0 },
+ { 0, 1 }
+};
+const ProtobufCMessageDescriptor gpspm_flush_response__descriptor =
+{
+ PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+ "GPSPM_flush_response",
+ "GPSPMFlushResponse",
+ "GPSPMFlushResponse",
+ "",
+ sizeof(GPSPMFlushResponse),
+ 1,
+ gpspm_flush_response__field_descriptors,
+ gpspm_flush_response__field_indices_by_name,
+ 1, gpspm_flush_response__number_ranges,
+ (ProtobufCMessageInit) gpspm_flush_response__init,
+ NULL,NULL,NULL /* reserved[123] */
+};
diff --git a/engines/librpma_gpspm_flush.pb-c.h b/engines/librpma_gpspm_flush.pb-c.h
new file mode 100644
index 00000000..ad475a95
--- /dev/null
+++ b/engines/librpma_gpspm_flush.pb-c.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2020, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Generated by the protocol buffer compiler. DO NOT EDIT! */
+/* Generated from: librpma_gpspm_flush.proto */
+
+#ifndef PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED
+#define PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED
+
+#include <protobuf-c/protobuf-c.h>
+
+PROTOBUF_C__BEGIN_DECLS
+
+#if PROTOBUF_C_VERSION_NUMBER < 1000000
+# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
+#elif 1003003 < PROTOBUF_C_MIN_COMPILER_VERSION
+# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
+#endif
+
+
+typedef struct _GPSPMFlushRequest GPSPMFlushRequest;
+typedef struct _GPSPMFlushResponse GPSPMFlushResponse;
+
+
+/* --- enums --- */
+
+
+/* --- messages --- */
+
+struct _GPSPMFlushRequest
+{
+ ProtobufCMessage base;
+ uint64_t offset;
+ uint64_t length;
+ uint64_t op_context;
+};
+#define GPSPM_FLUSH_REQUEST__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_request__descriptor) \
+ , 0, 0, 0 }
+
+
+struct _GPSPMFlushResponse
+{
+ ProtobufCMessage base;
+ uint64_t op_context;
+};
+#define GPSPM_FLUSH_RESPONSE__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_response__descriptor) \
+ , 0 }
+
+
+/* GPSPMFlushRequest methods */
+void gpspm_flush_request__init
+ (GPSPMFlushRequest *message);
+size_t gpspm_flush_request__get_packed_size
+ (const GPSPMFlushRequest *message);
+size_t gpspm_flush_request__pack
+ (const GPSPMFlushRequest *message,
+ uint8_t *out);
+size_t gpspm_flush_request__pack_to_buffer
+ (const GPSPMFlushRequest *message,
+ ProtobufCBuffer *buffer);
+GPSPMFlushRequest *
+ gpspm_flush_request__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data);
+void gpspm_flush_request__free_unpacked
+ (GPSPMFlushRequest *message,
+ ProtobufCAllocator *allocator);
+/* GPSPMFlushResponse methods */
+void gpspm_flush_response__init
+ (GPSPMFlushResponse *message);
+size_t gpspm_flush_response__get_packed_size
+ (const GPSPMFlushResponse *message);
+size_t gpspm_flush_response__pack
+ (const GPSPMFlushResponse *message,
+ uint8_t *out);
+size_t gpspm_flush_response__pack_to_buffer
+ (const GPSPMFlushResponse *message,
+ ProtobufCBuffer *buffer);
+GPSPMFlushResponse *
+ gpspm_flush_response__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data);
+void gpspm_flush_response__free_unpacked
+ (GPSPMFlushResponse *message,
+ ProtobufCAllocator *allocator);
+/* --- per-message closures --- */
+
+typedef void (*GPSPMFlushRequest_Closure)
+ (const GPSPMFlushRequest *message,
+ void *closure_data);
+typedef void (*GPSPMFlushResponse_Closure)
+ (const GPSPMFlushResponse *message,
+ void *closure_data);
+
+/* --- services --- */
+
+
+/* --- descriptors --- */
+
+extern const ProtobufCMessageDescriptor gpspm_flush_request__descriptor;
+extern const ProtobufCMessageDescriptor gpspm_flush_response__descriptor;
+
+PROTOBUF_C__END_DECLS
+
+
+#endif /* PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED */
diff --git a/engines/librpma_gpspm_flush.proto b/engines/librpma_gpspm_flush.proto
new file mode 100644
index 00000000..91765a7f
--- /dev/null
+++ b/engines/librpma_gpspm_flush.proto
@@ -0,0 +1,15 @@
+syntax = "proto2";
+
+message GPSPM_flush_request {
+ /* an offset of a region to be flushed within its memory registration */
+ required fixed64 offset = 1;
+ /* a length of a region to be flushed */
+ required fixed64 length = 2;
+ /* a user-defined operation context */
+ required fixed64 op_context = 3;
+}
+
+message GPSPM_flush_response {
+ /* the operation context of a completed request */
+ required fixed64 op_context = 1;
+}
diff --git a/engines/libzbc.c b/engines/libzbc.c
index 4b900233..7f2bc431 100644
--- a/engines/libzbc.c
+++ b/engines/libzbc.c
@@ -19,6 +19,7 @@ struct libzbc_data {
struct zbc_device *zdev;
enum zbc_dev_model model;
uint64_t nr_sectors;
+ uint32_t max_open_seq_req;
};
static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f)
@@ -32,6 +33,7 @@ static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f)
zbc_get_device_info(ld->zdev, zinfo);
ld->model = zinfo->zbd_model;
ld->nr_sectors = zinfo->zbd_sectors;
+ ld->max_open_seq_req = zinfo->zbd_max_nr_open_seq_req;
dprint(FD_ZBD, "%s: vendor_id:%s, type: %s, model: %s\n",
f->file_name, zinfo->zbd_vendor_id,
@@ -86,7 +88,8 @@ static int libzbc_open_dev(struct thread_data *td, struct fio_file *f,
return -ENOMEM;
ret = zbc_open(f->file_name,
- flags | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA, &ld->zdev);
+ flags | ZBC_O_DRV_BLOCK | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA,
+ &ld->zdev);
if (ret) {
log_err("%s: zbc_open() failed, err=%d\n",
f->file_name, ret);
@@ -177,10 +180,8 @@ static int libzbc_get_zoned_model(struct thread_data *td, struct fio_file *f,
struct libzbc_data *ld;
int ret;
- if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) {
- *model = ZBD_IGNORE;
- return 0;
- }
+ if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR)
+ return -EINVAL;
ret = libzbc_open_dev(td, f, &ld);
if (ret)
@@ -283,7 +284,7 @@ static int libzbc_report_zones(struct thread_data *td, struct fio_file *f,
default:
/* Treat all these conditions as offline (don't use!) */
zbdz->cond = ZBD_ZONE_COND_OFFLINE;
- break;
+ zbdz->wp = zbdz->start;
}
}
@@ -334,6 +335,24 @@ err:
return -ret;
}
+static int libzbc_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones)
+{
+ struct libzbc_data *ld;
+ int ret;
+
+ ret = libzbc_open_dev(td, f, &ld);
+ if (ret)
+ return ret;
+
+ if (ld->max_open_seq_req == ZBC_NO_LIMIT)
+ *max_open_zones = 0;
+ else
+ *max_open_zones = ld->max_open_seq_req;
+
+ return 0;
+}
+
ssize_t libzbc_rw(struct thread_data *td, struct io_u *io_u)
{
struct libzbc_data *ld = td->io_ops_data;
@@ -413,6 +432,7 @@ FIO_STATIC struct ioengine_ops ioengine = {
.get_zoned_model = libzbc_get_zoned_model,
.report_zones = libzbc_report_zones,
.reset_wp = libzbc_reset_wp,
+ .get_max_open_zones = libzbc_get_max_open_zones,
.queue = libzbc_queue,
.flags = FIO_SYNCIO | FIO_NOEXTEND | FIO_RAWIO,
};
diff --git a/engines/net.c b/engines/net.c
index 91f25774..c6cec584 100644
--- a/engines/net.c
+++ b/engines/net.c
@@ -938,8 +938,9 @@ static int fio_netio_udp_recv_open(struct thread_data *td, struct fio_file *f)
if (ntohl(msg.magic) != FIO_LINK_OPEN_CLOSE_MAGIC ||
ntohl(msg.cmd) != FIO_LINK_OPEN) {
- log_err("fio: bad udp open magic %x/%x\n", ntohl(msg.magic),
- ntohl(msg.cmd));
+ log_err("fio: bad udp open magic %x/%x\n",
+ (unsigned int) ntohl(msg.magic),
+ (unsigned int) ntohl(msg.cmd));
return -1;
}
diff --git a/engines/nfs.c b/engines/nfs.c
new file mode 100644
index 00000000..21be8833
--- /dev/null
+++ b/engines/nfs.c
@@ -0,0 +1,314 @@
+#include <stdlib.h>
+#include <poll.h>
+#include <nfsc/libnfs.h>
+#include <nfsc/libnfs-raw.h>
+#include <nfsc/libnfs-raw-mount.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+enum nfs_op_type {
+ NFS_READ_WRITE = 0,
+ NFS_STAT_MKDIR_RMDIR,
+ NFS_STAT_TOUCH_RM,
+};
+
+struct fio_libnfs_options {
+ struct nfs_context *context;
+ char *nfs_url;
+ unsigned int queue_depth; /* nfs_callback needs this info, but doesn't have fio td structure to pull it from */
+ /* the following implement a circular queue of outstanding IOs */
+ int outstanding_events; /* IOs issued to libnfs, that have not returned yet */
+ int prev_requested_event_index; /* event last returned via fio_libnfs_event */
+ int next_buffered_event; /* round robin-pointer within events[] */
+ int buffered_event_count; /* IOs completed by libnfs, waiting for FIO */
+ int free_event_buffer_index; /* next free buffer */
+ struct io_u**events;
+};
+
+struct nfs_data {
+ struct nfsfh *nfsfh;
+ struct fio_libnfs_options *options;
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "nfs_url",
+ .lname = "nfs_url",
+ .type = FIO_OPT_STR_STORE,
+ .help = "URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]",
+ .off1 = offsetof(struct fio_libnfs_options, nfs_url),
+ .category = FIO_OPT_C_ENGINE,
+ .group = __FIO_OPT_G_NFS,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+static struct io_u *fio_libnfs_event(struct thread_data *td, int event)
+{
+ struct fio_libnfs_options *o = td->eo;
+ struct io_u *io_u = o->events[o->next_buffered_event];
+ assert(o->events[o->next_buffered_event]);
+ o->events[o->next_buffered_event] = NULL;
+ o->next_buffered_event = (o->next_buffered_event + 1) % td->o.iodepth;
+ /* validate our state machine */
+ assert(o->buffered_event_count);
+ o->buffered_event_count--;
+ assert(io_u);
+ /* assert that fio_libnfs_event is being called in sequential fashion */
+ assert(event == 0 || o->prev_requested_event_index + 1 == event);
+ if (o->buffered_event_count == 0) {
+ o->prev_requested_event_index = -1;
+ } else {
+ o->prev_requested_event_index = event;
+ }
+ return io_u;
+}
+
+static int nfs_event_loop(struct thread_data *td, bool flush) {
+ struct fio_libnfs_options *o = td->eo;
+ struct pollfd pfds[1]; /* nfs:0 */
+ /* we already have stuff queued for fio, no need to waste cpu on poll() */
+ if (o->buffered_event_count)
+ return o->buffered_event_count;
+ /* fio core logic seems to stop calling this event-loop if we ever return with 0 events */
+ #define SHOULD_WAIT() (o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events))
+
+ do {
+ int timeout = SHOULD_WAIT() ? -1 : 0;
+ int ret = 0;
+ pfds[0].fd = nfs_get_fd(o->context);
+ pfds[0].events = nfs_which_events(o->context);
+ ret = poll(&pfds[0], 1, timeout);
+ if (ret < 0) {
+ if (errno == EINTR || errno == EAGAIN) {
+ continue;
+ }
+ log_err("nfs: failed to poll events: %s.\n",
+ strerror(errno));
+ break;
+ }
+
+ ret = nfs_service(o->context, pfds[0].revents);
+ if (ret < 0) {
+ log_err("nfs: socket is in an unrecoverable error state.\n");
+ break;
+ }
+ } while (SHOULD_WAIT());
+ return o->buffered_event_count;
+#undef SHOULD_WAIT
+}
+
+static int fio_libnfs_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t)
+{
+ return nfs_event_loop(td, false);
+}
+
+static void nfs_callback(int res, struct nfs_context *nfs, void *data,
+ void *private_data)
+{
+ struct io_u *io_u = private_data;
+ struct nfs_data *nfs_data = io_u->file->engine_data;
+ struct fio_libnfs_options *o = nfs_data->options;
+ if (res < 0) {
+ log_err("Failed NFS operation(code:%d): %s\n", res, nfs_get_error(o->context));
+ io_u->error = -res;
+ /* res is used for read math below, don't wanna pass negative there */
+ res = 0;
+ } else if (io_u->ddir == DDIR_READ) {
+ memcpy(io_u->buf, data, res);
+ if (res == 0)
+ log_err("Got NFS EOF, this is probably not expected\n");
+ }
+ /* fio uses resid to track remaining data */
+ io_u->resid = io_u->xfer_buflen - res;
+
+ assert(!o->events[o->free_event_buffer_index]);
+ o->events[o->free_event_buffer_index] = io_u;
+ o->free_event_buffer_index = (o->free_event_buffer_index + 1) % o->queue_depth;
+ o->outstanding_events--;
+ o->buffered_event_count++;
+}
+
+static int queue_write(struct fio_libnfs_options *o, struct io_u *io_u) {
+ struct nfs_data *nfs_data = io_u->engine_data;
+ return nfs_pwrite_async(o->context, nfs_data->nfsfh,
+ io_u->offset, io_u->buflen, io_u->buf, nfs_callback,
+ io_u);
+}
+
+static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) {
+ struct nfs_data *nfs_data = io_u->engine_data;
+ return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, io_u->buflen, nfs_callback, io_u);
+}
+
+static enum fio_q_status fio_libnfs_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct nfs_data *nfs_data = io_u->file->engine_data;
+ struct fio_libnfs_options *o = nfs_data->options;
+ struct nfs_context *nfs = o->context;
+ int err;
+ enum fio_q_status ret = FIO_Q_QUEUED;
+
+ io_u->engine_data = nfs_data;
+ switch(io_u->ddir) {
+ case DDIR_WRITE:
+ err = queue_write(o, io_u);
+ break;
+ case DDIR_READ:
+ err = queue_read(o, io_u);
+ break;
+ case DDIR_TRIM:
+ log_err("nfs: trim is not supported");
+ err = -1;
+ break;
+ default:
+ log_err("nfs: unhandled io %d\n", io_u->ddir);
+ err = -1;
+ }
+ if (err) {
+ log_err("nfs: Failed to queue nfs op: %s\n", nfs_get_error(nfs));
+ td->error = 1;
+ return FIO_Q_COMPLETED;
+ }
+ o->outstanding_events++;
+ return ret;
+}
+
+/*
+ * Do a mount if one has not been done before
+ */
+static int do_mount(struct thread_data *td, const char *url)
+{
+ size_t event_size = sizeof(struct io_u **) * td->o.iodepth;
+ struct fio_libnfs_options *options = td->eo;
+ struct nfs_url *nfs_url = NULL;
+ int ret = 0;
+ int path_len = 0;
+ char *mnt_dir = NULL;
+
+ if (options->context)
+ return 0;
+
+ options->context = nfs_init_context();
+ if (options->context == NULL) {
+ log_err("nfs: failed to init nfs context\n");
+ return -1;
+ }
+
+ options->events = malloc(event_size);
+ memset(options->events, 0, event_size);
+
+ options->prev_requested_event_index = -1;
+ options->queue_depth = td->o.iodepth;
+
+ nfs_url = nfs_parse_url_full(options->context, url);
+ path_len = strlen(nfs_url->path);
+ mnt_dir = malloc(path_len + strlen(nfs_url->file) + 1);
+ strcpy(mnt_dir, nfs_url->path);
+ strcpy(mnt_dir + strlen(nfs_url->path), nfs_url->file);
+ ret = nfs_mount(options->context, nfs_url->server, mnt_dir);
+ free(mnt_dir);
+ nfs_destroy_url(nfs_url);
+ return ret;
+}
+
+static int fio_libnfs_setup(struct thread_data *td)
+{
+	/* Using threads with libnfs causes fio to hang on exit and lowers performance */
+ td->o.use_thread = 0;
+ return 0;
+}
+
+static void fio_libnfs_cleanup(struct thread_data *td)
+{
+ struct fio_libnfs_options *o = td->eo;
+ nfs_umount(o->context);
+ nfs_destroy_context(o->context);
+ free(o->events);
+}
+
+static int fio_libnfs_open(struct thread_data *td, struct fio_file *f)
+{
+ int ret;
+ struct fio_libnfs_options *options = td->eo;
+ struct nfs_data *nfs_data = NULL;
+ int flags = 0;
+
+ if (!options->nfs_url) {
+ log_err("nfs: nfs_url is a required parameter\n");
+ return -1;
+ }
+
+ ret = do_mount(td, options->nfs_url);
+
+ if (ret != 0) {
+ log_err("nfs: Failed to mount %s with code %d: %s\n", options->nfs_url, ret, nfs_get_error(options->context));
+ return ret;
+ }
+ nfs_data = malloc(sizeof(struct nfs_data));
+ memset(nfs_data, 0, sizeof(struct nfs_data));
+ nfs_data->options = options;
+
+ if (td->o.td_ddir == TD_DDIR_WRITE) {
+ flags |= O_CREAT | O_RDWR;
+ } else {
+ flags |= O_RDWR;
+ }
+ ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh);
+
+ if (ret != 0)
+ log_err("Failed to open %s: %s\n", f->file_name, nfs_get_error(options->context));
+ f->engine_data = nfs_data;
+ return ret;
+}
+
+static int fio_libnfs_close(struct thread_data *td, struct fio_file *f)
+{
+ struct nfs_data *nfs_data = f->engine_data;
+ struct fio_libnfs_options *o = nfs_data->options;
+ int ret = 0;
+ if (nfs_data->nfsfh)
+ ret = nfs_close(o->context, nfs_data->nfsfh);
+ free(nfs_data);
+ f->engine_data = NULL;
+ return ret;
+}
+
+/*
+ * Hook for writing out outstanding data.
+ */
+static int fio_libnfs_commit(struct thread_data *td) {
+ nfs_event_loop(td, true);
+ return 0;
+}
+
+struct ioengine_ops ioengine = {
+ .name = "nfs",
+ .version = FIO_IOOPS_VERSION,
+ .setup = fio_libnfs_setup,
+ .queue = fio_libnfs_queue,
+ .getevents = fio_libnfs_getevents,
+ .event = fio_libnfs_event,
+ .cleanup = fio_libnfs_cleanup,
+ .open_file = fio_libnfs_open,
+ .close_file = fio_libnfs_close,
+ .commit = fio_libnfs_commit,
+ .flags = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+ .options = options,
+ .option_struct_size = sizeof(struct fio_libnfs_options),
+};
+
+static void fio_init fio_nfs_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_nfs_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/engines/posixaio.c b/engines/posixaio.c
index 82c6aa65..135d088c 100644
--- a/engines/posixaio.c
+++ b/engines/posixaio.c
@@ -17,47 +17,14 @@ struct posixaio_data {
unsigned int queued;
};
-static int fill_timespec(struct timespec *ts)
+static unsigned long long ts_utime_since_now(const struct timespec *start)
{
-#ifdef CONFIG_CLOCK_GETTIME
-#ifdef CONFIG_CLOCK_MONOTONIC
- clockid_t clk = CLOCK_MONOTONIC;
-#else
- clockid_t clk = CLOCK_REALTIME;
-#endif
- if (!clock_gettime(clk, ts))
- return 0;
-
- perror("clock_gettime");
- return 1;
-#else
- struct timeval tv;
-
- gettimeofday(&tv, NULL);
- ts->tv_sec = tv.tv_sec;
- ts->tv_nsec = tv.tv_usec * 1000;
- return 0;
-#endif
-}
-
-static unsigned long long ts_utime_since_now(struct timespec *t)
-{
- long long sec, nsec;
struct timespec now;
- if (fill_timespec(&now))
+ if (fio_get_mono_time(&now) < 0)
return 0;
-
- sec = now.tv_sec - t->tv_sec;
- nsec = now.tv_nsec - t->tv_nsec;
- if (sec > 0 && nsec < 0) {
- sec--;
- nsec += 1000000000;
- }
- sec *= 1000000;
- nsec /= 1000;
- return sec + nsec;
+ return utime_since(start, &now);
}
static int fio_posixaio_cancel(struct thread_data fio_unused *td,
@@ -102,7 +69,7 @@ static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
unsigned int r;
int i;
- if (t && !fill_timespec(&start))
+ if (t && fio_get_mono_time(&start) == 0)
have_timeout = 1;
else
memset(&start, 0, sizeof(start));
diff --git a/engines/rados.c b/engines/rados.c
index 42ee48ff..23e62c4c 100644
--- a/engines/rados.c
+++ b/engines/rados.c
@@ -38,6 +38,7 @@ struct rados_options {
char *pool_name;
char *client_name;
int busy_poll;
+ int touch_objects;
};
static struct fio_option options[] = {
@@ -79,6 +80,16 @@ static struct fio_option options[] = {
.group = FIO_OPT_G_RBD,
},
{
+ .name = "touch_objects",
+ .lname = "touch objects on start",
+ .type = FIO_OPT_BOOL,
+ .help = "Touch (create) objects on start",
+ .off1 = offsetof(struct rados_options, touch_objects),
+ .def = "1",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_RBD,
+ },
+ {
.name = NULL,
},
};
@@ -194,9 +205,11 @@ static int _fio_rados_connect(struct thread_data *td)
for (i = 0; i < td->o.nr_files; i++) {
f = td->files[i];
f->real_file_size = file_size;
- r = rados_write(rados->io_ctx, f->file_name, "", 0, 0);
- if (r < 0) {
- goto failed_obj_create;
+ if (o->touch_objects) {
+ r = rados_write(rados->io_ctx, f->file_name, "", 0, 0);
+ if (r < 0) {
+ goto failed_obj_create;
+ }
}
}
return 0;
diff --git a/engines/rbd.c b/engines/rbd.c
index 268b6ebd..c6203d4c 100644
--- a/engines/rbd.c
+++ b/engines/rbd.c
@@ -227,12 +227,30 @@ static int _fio_rbd_connect(struct thread_data *td)
goto failed_shutdown;
}
+ if (td->o.odirect) {
+ r = rados_conf_set(rbd->cluster, "rbd_cache", "false");
+ if (r < 0) {
+ log_info("failed to disable RBD in-memory cache\n");
+ }
+ }
+
r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ );
if (r < 0) {
log_err("rbd_open failed.\n");
goto failed_open;
}
+ if (!td->o.odirect) {
+ /*
+ * ensure cache enables writeback/around mode unless explicitly
+ * configured for writethrough mode
+ */
+ r = rbd_flush(rbd->image);
+ if (r < 0) {
+ log_info("rbd: failed to issue initial flush\n");
+ }
+ }
+
if (!_fio_rbd_setup_poll(rbd))
goto failed_poll;
diff --git a/engines/sg.c b/engines/sg.c
index a1a6de4c..0c2d2c8b 100644
--- a/engines/sg.c
+++ b/engines/sg.c
@@ -60,6 +60,10 @@
#ifdef FIO_HAVE_SGIO
+#ifndef SGV4_FLAG_HIPRI
+#define SGV4_FLAG_HIPRI 0x800
+#endif
+
enum {
FIO_SG_WRITE = 1,
FIO_SG_WRITE_VERIFY = 2,
@@ -68,12 +72,22 @@ enum {
struct sg_options {
void *pad;
+ unsigned int hipri;
unsigned int readfua;
unsigned int writefua;
unsigned int write_mode;
};
static struct fio_option options[] = {
+ {
+ .name = "hipri",
+ .lname = "High Priority",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct sg_options, hipri),
+ .help = "Use polled IO completions",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_SG,
+ },
{
.name = "readfua",
.lname = "sg engine read fua flag support",
@@ -527,6 +541,8 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
else
hdr->cmdp[0] = 0x88; // read(16)
+ if (o->hipri)
+ hdr->flags |= SGV4_FLAG_HIPRI;
if (o->readfua)
hdr->cmdp[1] |= 0x08;
@@ -542,6 +558,8 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
hdr->cmdp[0] = 0x2a; // write(10)
else
hdr->cmdp[0] = 0x8a; // write(16)
+ if (o->hipri)
+ hdr->flags |= SGV4_FLAG_HIPRI;
if (o->writefua)
hdr->cmdp[1] |= 0x08;
break;
@@ -865,6 +883,7 @@ static int fio_sgio_init(struct thread_data *td)
{
struct sgio_data *sd;
struct sgio_trim *st;
+ struct sg_io_hdr *h3p;
int i;
sd = calloc(1, sizeof(*sd));
@@ -880,12 +899,13 @@ static int fio_sgio_init(struct thread_data *td)
#ifdef FIO_SGIO_DEBUG
sd->trim_queue_map = calloc(td->o.iodepth, sizeof(int));
#endif
- for (i = 0; i < td->o.iodepth; i++) {
+ for (i = 0, h3p = sd->sgbuf; i < td->o.iodepth; i++, ++h3p) {
sd->trim_queues[i] = calloc(1, sizeof(struct sgio_trim));
st = sd->trim_queues[i];
st->unmap_param = calloc(td->o.iodepth + 1, sizeof(char[16]));
st->unmap_range_count = 0;
st->trim_io_us = calloc(td->o.iodepth, sizeof(struct io_u *));
+ h3p->interface_id = 'S';
}
td->io_ops_data = sd;
diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c
index 7f3e4cb3..cff83a10 100644
--- a/engines/skeleton_external.c
+++ b/engines/skeleton_external.c
@@ -156,7 +156,6 @@ static int fio_skeleton_close(struct thread_data *td, struct fio_file *f)
/*
* Hook for getting the zoned model of a zoned block device for zonemode=zbd.
* The zoned model can be one of (see zbd_types.h):
- * - ZBD_IGNORE: skip regular files
* - ZBD_NONE: regular block device (zone emulation will be used)
* - ZBD_HOST_AWARE: host aware zoned block device
* - ZBD_HOST_MANAGED: host managed zoned block device
@@ -194,6 +193,18 @@ static int fio_skeleton_reset_wp(struct thread_data *td, struct fio_file *f,
}
/*
+ * Hook called for getting the maximum number of open zones for a
+ * ZBD_HOST_MANAGED zoned block device.
+ * A @max_open_zones value set to zero means no limit.
+ */
+static int fio_skeleton_get_max_open_zones(struct thread_data *td,
+ struct fio_file *f,
+ unsigned int *max_open_zones)
+{
+ return 0;
+}
+
+/*
* Note that the structure is exported, so that fio can get it via
* dlsym(..., "ioengine"); for (and only for) external engines.
*/
@@ -212,6 +223,7 @@ struct ioengine_ops ioengine = {
.get_zoned_model = fio_skeleton_get_zoned_model,
.report_zones = fio_skeleton_report_zones,
.reset_wp = fio_skeleton_reset_wp,
+ .get_max_open_zones = fio_skeleton_get_max_open_zones,
.options = options,
.option_struct_size = sizeof(struct fio_skeleton_options),
};
diff --git a/engines/windowsaio.c b/engines/windowsaio.c
index 5c7e7964..9868e816 100644
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -161,15 +161,15 @@ static int windowsaio_invalidate_cache(struct fio_file *f)
if (ihFile != INVALID_HANDLE_VALUE) {
if (!CloseHandle(ihFile)) {
error = GetLastError();
- log_info("windowsaio: invalidation fd close %s "
- "failed: error %d\n", f->file_name, error);
+ log_info("windowsaio: invalidation fd close %s failed: error %lu\n",
+ f->file_name, error);
rc = 1;
}
} else {
error = GetLastError();
if (error != ERROR_FILE_NOT_FOUND) {
- log_info("windowsaio: cache invalidation of %s failed: "
- "error %d\n", f->file_name, error);
+ log_info("windowsaio: cache invalidation of %s failed: error %lu\n",
+ f->file_name, error);
rc = 1;
}
}
diff --git a/eta.c b/eta.c
index e8c72780..db13cb18 100644
--- a/eta.c
+++ b/eta.c
@@ -331,7 +331,7 @@ static void calc_rate(int unified_rw_rep, unsigned long mtime,
else
this_rate = 0;
- if (unified_rw_rep) {
+ if (unified_rw_rep == UNIFIED_MIXED) {
rate[i] = 0;
rate[0] += this_rate;
} else
@@ -356,7 +356,7 @@ static void calc_iops(int unified_rw_rep, unsigned long mtime,
else
this_iops = 0;
- if (unified_rw_rep) {
+ if (unified_rw_rep == UNIFIED_MIXED) {
iops[i] = 0;
iops[0] += this_iops;
} else
@@ -507,6 +507,7 @@ bool calc_thread_status(struct jobs_eta *je, int force)
calc_rate(unified_rw_rep, rate_time, io_bytes, rate_io_bytes,
je->rate);
memcpy(&rate_prev_time, &now, sizeof(now));
+ regrow_agg_logs();
for_each_rw_ddir(ddir) {
add_agg_sample(sample_val(je->rate[ddir]), ddir, 0, 0);
}
@@ -534,56 +535,38 @@ bool calc_thread_status(struct jobs_eta *je, int force)
static int gen_eta_str(struct jobs_eta *je, char *p, size_t left,
char **rate_str, char **iops_str)
{
- bool has_r = je->rate[DDIR_READ] || je->iops[DDIR_READ];
- bool has_w = je->rate[DDIR_WRITE] || je->iops[DDIR_WRITE];
- bool has_t = je->rate[DDIR_TRIM] || je->iops[DDIR_TRIM];
+ static const char c[DDIR_RWDIR_CNT] = {'r', 'w', 't'};
+ bool has[DDIR_RWDIR_CNT];
+ bool has_any = false;
+ const char *sep;
int l = 0;
- if (!has_r && !has_w && !has_t)
+ for_each_rw_ddir(ddir) {
+ has[ddir] = (je->rate[ddir] || je->iops[ddir]);
+ has_any |= has[ddir];
+ }
+ if (!has_any)
return 0;
- if (has_r) {
- l += snprintf(p + l, left - l, "[r=%s", rate_str[DDIR_READ]);
- if (!has_w)
- l += snprintf(p + l, left - l, "]");
- }
- if (has_w) {
- if (has_r)
- l += snprintf(p + l, left - l, ",");
- else
- l += snprintf(p + l, left - l, "[");
- l += snprintf(p + l, left - l, "w=%s", rate_str[DDIR_WRITE]);
- if (!has_t)
- l += snprintf(p + l, left - l, "]");
- }
- if (has_t) {
- if (has_r || has_w)
- l += snprintf(p + l, left - l, ",");
- else if (!has_r && !has_w)
- l += snprintf(p + l, left - l, "[");
- l += snprintf(p + l, left - l, "t=%s]", rate_str[DDIR_TRIM]);
- }
- if (has_r) {
- l += snprintf(p + l, left - l, "[r=%s", iops_str[DDIR_READ]);
- if (!has_w)
- l += snprintf(p + l, left - l, " IOPS]");
- }
- if (has_w) {
- if (has_r)
- l += snprintf(p + l, left - l, ",");
- else
- l += snprintf(p + l, left - l, "[");
- l += snprintf(p + l, left - l, "w=%s", iops_str[DDIR_WRITE]);
- if (!has_t)
- l += snprintf(p + l, left - l, " IOPS]");
+ l += snprintf(p + l, left - l, "[");
+ sep = "";
+ for_each_rw_ddir(ddir) {
+ if (has[ddir]) {
+ l += snprintf(p + l, left - l, "%s%c=%s",
+ sep, c[ddir], rate_str[ddir]);
+ sep = ",";
+ }
}
- if (has_t) {
- if (has_r || has_w)
- l += snprintf(p + l, left - l, ",");
- else if (!has_r && !has_w)
- l += snprintf(p + l, left - l, "[");
- l += snprintf(p + l, left - l, "t=%s IOPS]", iops_str[DDIR_TRIM]);
+ l += snprintf(p + l, left - l, "][");
+ sep = "";
+ for_each_rw_ddir(ddir) {
+ if (has[ddir]) {
+ l += snprintf(p + l, left - l, "%s%c=%s",
+ sep, c[ddir], iops_str[ddir]);
+ sep = ",";
+ }
}
+ l += snprintf(p + l, left - l, " IOPS]");
return l;
}
diff --git a/examples/1mbs_clients.png b/examples/1mbs_clients.png
new file mode 100644
index 00000000..3f972dc6
--- /dev/null
+++ b/examples/1mbs_clients.png
Binary files differ
diff --git a/examples/aio-read.png b/examples/aio-read.png
new file mode 100644
index 00000000..e0c020a5
--- /dev/null
+++ b/examples/aio-read.png
Binary files differ
diff --git a/examples/backwards-read.png b/examples/backwards-read.png
new file mode 100644
index 00000000..81dc9208
--- /dev/null
+++ b/examples/backwards-read.png
Binary files differ
diff --git a/examples/basic-verify.png b/examples/basic-verify.png
new file mode 100644
index 00000000..98f73020
--- /dev/null
+++ b/examples/basic-verify.png
Binary files differ
diff --git a/examples/butterfly.png b/examples/butterfly.png
new file mode 100644
index 00000000..2c566512
--- /dev/null
+++ b/examples/butterfly.png
Binary files differ
diff --git a/examples/cpp_null.fio b/examples/cpp_null.fio
index 436ed90a..7c62beaf 100644
--- a/examples/cpp_null.fio
+++ b/examples/cpp_null.fio
@@ -7,4 +7,4 @@ ioengine=cpp_null
size=100g
rw=randread
norandommap
-time_based=0
+time_based
diff --git a/examples/cpp_null.png b/examples/cpp_null.png
new file mode 100644
index 00000000..5303ac2a
--- /dev/null
+++ b/examples/cpp_null.png
Binary files differ
diff --git a/examples/cpuio.fio b/examples/cpuio.fio
index 577e0729..471cf4b2 100644
--- a/examples/cpuio.fio
+++ b/examples/cpuio.fio
@@ -1,8 +1,18 @@
[global]
ioengine=cpuio
time_based
-runtime=10
+runtime=15
-[burn50percent]
+# The following example load 2 cores at 50% with the noop (default) mode
+[burn_2x50_noop]
cpuload=50
+numjobs=2
+cpumode=noop
+# Once burn_2x50_noop is over,
+# fio load 2 cores at 50% with the qsort mode which drains much more power
+[burn_2x50%_qsort]
+stonewall
+cpuload=50
+numjobs=2
+cpumode=qsort
diff --git a/examples/cpuio.png b/examples/cpuio.png
new file mode 100644
index 00000000..02938dbb
--- /dev/null
+++ b/examples/cpuio.png
Binary files differ
diff --git a/examples/cross-stripe-verify.fio b/examples/cross-stripe-verify.fio
index 68664ed0..47c0889c 100644
--- a/examples/cross-stripe-verify.fio
+++ b/examples/cross-stripe-verify.fio
@@ -17,7 +17,7 @@ verify_backlog=1
offset_increment=124g
io_size=120g
offset=120k
-group_reporting=1
+group_reporting
verify_dump=1
loops=2
diff --git a/examples/cross-stripe-verify.png b/examples/cross-stripe-verify.png
new file mode 100644
index 00000000..90aa630f
--- /dev/null
+++ b/examples/cross-stripe-verify.png
Binary files differ
diff --git a/examples/dev-dax.fio b/examples/dev-dax.fio
index d9f430eb..88bce31b 100644
--- a/examples/dev-dax.fio
+++ b/examples/dev-dax.fio
@@ -2,7 +2,7 @@
bs=2m
ioengine=dev-dax
norandommap
-time_based=1
+time_based
runtime=30
group_reporting
disable_lat=1
@@ -18,7 +18,7 @@ cpus_allowed_policy=split
#
iodepth=1
direct=0
-thread=1
+thread
numjobs=16
#
# The dev-dax engine does IO to DAX device that are special character
diff --git a/examples/dev-dax.png b/examples/dev-dax.png
new file mode 100644
index 00000000..2463bca3
--- /dev/null
+++ b/examples/dev-dax.png
Binary files differ
diff --git a/examples/dfs.fio b/examples/dfs.fio
new file mode 100644
index 00000000..5de887d7
--- /dev/null
+++ b/examples/dfs.fio
@@ -0,0 +1,33 @@
+[global]
+ioengine=dfs
+pool=${POOL}
+cont=${CONT}
+filename_format=fio-test.$jobnum
+
+cpus_allowed_policy=split
+group_reporting=1
+time_based=0
+percentile_list=99.0:99.9:99.99:99.999:99.9999:100
+disable_slat=1
+disable_clat=1
+
+bs=1M
+size=100G
+iodepth=16
+numjobs=16
+
+[daos-seqwrite]
+rw=write
+stonewall
+
+[daos-seqread]
+rw=read
+stonewall
+
+[daos-randwrite]
+rw=randwrite
+stonewall
+
+[daos-randread]
+rw=randread
+stonewall
diff --git a/examples/dfs.png b/examples/dfs.png
new file mode 100644
index 00000000..049ccaec
--- /dev/null
+++ b/examples/dfs.png
Binary files differ
diff --git a/examples/disk-zone-profile.png b/examples/disk-zone-profile.png
new file mode 100644
index 00000000..5f7b24c9
--- /dev/null
+++ b/examples/disk-zone-profile.png
Binary files differ
diff --git a/examples/e4defrag.fio b/examples/e4defrag.fio
index cb94e85a..d6495f7a 100644
--- a/examples/e4defrag.fio
+++ b/examples/e4defrag.fio
@@ -18,7 +18,7 @@ rw=write
# Run e4defrag and aio-dio workers in parallel
[e4defrag]
stonewall
-time_based=30
+time_based
runtime=30
ioengine=e4defrag
buffered=0
diff --git a/examples/e4defrag.png b/examples/e4defrag.png
new file mode 100644
index 00000000..00a7fefd
--- /dev/null
+++ b/examples/e4defrag.png
Binary files differ
diff --git a/examples/e4defrag2.fio b/examples/e4defrag2.fio
index c6485997..86554ef7 100644
--- a/examples/e4defrag2.fio
+++ b/examples/e4defrag2.fio
@@ -48,14 +48,14 @@ donorname=file.def
########
# Run random e4defrag and various aio workers in parallel
-[e4defrag-fuzzer-4k]
+[e4defrag-fuzzer-4k-bis]
stonewall
continue_on_error=all
inplace=1
bs=4k
donorname=file3.def
filename=file3
-time_based=30
+time_based
rw=randwrite
[buffered-aio-32k]
@@ -68,7 +68,7 @@ bs=32k
filename=file3
rw=randrw
runtime=30
-time_based=30
+time_based
numjobs=4
[direct-aio-32k]
@@ -82,7 +82,6 @@ bs=32k
filename=file3
rw=randrw
runtime=30
-time_based=30
numjobs=4
diff --git a/examples/e4defrag2.png b/examples/e4defrag2.png
new file mode 100644
index 00000000..8a128e95
--- /dev/null
+++ b/examples/e4defrag2.png
Binary files differ
diff --git a/examples/enospc-pressure.png b/examples/enospc-pressure.png
new file mode 100644
index 00000000..da28b7c0
--- /dev/null
+++ b/examples/enospc-pressure.png
Binary files differ
diff --git a/examples/exec.fio b/examples/exec.fio
new file mode 100644
index 00000000..ac1bedfb
--- /dev/null
+++ b/examples/exec.fio
@@ -0,0 +1,36 @@
+[global]
+time_based
+runtime=30
+
+[monitoring_noop]
+ioengine=exec
+program=/usr/sbin/turbostat
+arguments=-c package -qS --interval 5 -s Busy%,Bzy_MHz,Avg_MHz,CorWatt,PkgWatt,RAMWatt,PkgTmp
+
+[cpuload_noop]
+ioengine=cpuio
+cpuload=100
+numjobs=12
+cpumode=noop
+
+[sleep]
+# Let the processor cooling down for a few seconds
+stonewall
+ioengine=exec
+runtime=10
+program=/bin/sleep
+arguments=%r
+grace_time=0
+std_redirect=0
+
+[monitoring_qsort]
+stonewall
+ioengine=exec
+program=/usr/sbin/turbostat
+arguments=-c package -qS --interval 5 -s Busy%,Bzy_MHz,Avg_MHz,CorWatt,PkgWatt,RAMWatt,PkgTmp
+
+[cpuload_qsort]
+ioengine=cpuio
+cpuload=100
+numjobs=12
+cpumode=qsort
diff --git a/examples/exec.png b/examples/exec.png
new file mode 100644
index 00000000..5f9f3b59
--- /dev/null
+++ b/examples/exec.png
Binary files differ
diff --git a/examples/exitwhat.fio b/examples/exitwhat.fio
index c91d7375..864508c6 100644
--- a/examples/exitwhat.fio
+++ b/examples/exitwhat.fio
@@ -11,7 +11,7 @@
filename=/tmp/test
filesize=1G
blocksize=4096
-group_reporting=1
+group_reporting
exitall=1
[slow1]
diff --git a/examples/exitwhat.png b/examples/exitwhat.png
new file mode 100644
index 00000000..9fc1883f
--- /dev/null
+++ b/examples/exitwhat.png
Binary files differ
diff --git a/examples/falloc.fio b/examples/falloc.fio
index fa307314..fadf1321 100644
--- a/examples/falloc.fio
+++ b/examples/falloc.fio
@@ -15,7 +15,7 @@ group_reporting
[falloc-fuzzer]
stonewall
runtime=10
-time_based=10
+time_based
bssplit=4k/10:64k/50:32k/40
rw=randwrite
numjobs=1
@@ -24,7 +24,7 @@ filename=fragmented_file
[punch hole-fuzzer]
bs=4k
runtime=10
-time_based=10
+time_based
rw=randtrim
numjobs=2
filename=fragmented_file
diff --git a/examples/falloc.png b/examples/falloc.png
new file mode 100644
index 00000000..886be22e
--- /dev/null
+++ b/examples/falloc.png
Binary files differ
diff --git a/examples/filecreate-ioengine.png b/examples/filecreate-ioengine.png
new file mode 100644
index 00000000..45d11da3
--- /dev/null
+++ b/examples/filecreate-ioengine.png
Binary files differ
diff --git a/examples/filedelete-ioengine.fio b/examples/filedelete-ioengine.fio
new file mode 100644
index 00000000..3c0028f9
--- /dev/null
+++ b/examples/filedelete-ioengine.fio
@@ -0,0 +1,18 @@
+# Example filedelete job
+
+# 'filedelete' engine only do 'unlink(filename)', file will not be open().
+# 'filesize' must be set, then files will be created at setup stage.
+# 'unlink' is better set to 0, since the file is deleted in measurement.
+# the options disabled completion latency output such as 'disable_clat' and 'gtod_reduce' must not set.
+[global]
+ioengine=filedelete
+filesize=4k
+nrfiles=200
+unlink=0
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
diff --git a/examples/filedelete-ioengine.png b/examples/filedelete-ioengine.png
new file mode 100644
index 00000000..3512ab71
--- /dev/null
+++ b/examples/filedelete-ioengine.png
Binary files differ
diff --git a/examples/filestat-ioengine.png b/examples/filestat-ioengine.png
new file mode 100644
index 00000000..bed59ab9
--- /dev/null
+++ b/examples/filestat-ioengine.png
Binary files differ
diff --git a/examples/fio-rand-RW.fio b/examples/fio-rand-RW.fio
index 0df0bc17..a1074a1a 100644
--- a/examples/fio-rand-RW.fio
+++ b/examples/fio-rand-RW.fio
@@ -9,7 +9,7 @@ rwmixwrite=40
bs=4K
direct=0
numjobs=4
-time_based=1
+time_based
runtime=900
[file1]
diff --git a/examples/fio-rand-RW.png b/examples/fio-rand-RW.png
new file mode 100644
index 00000000..aa4b0998
--- /dev/null
+++ b/examples/fio-rand-RW.png
Binary files differ
diff --git a/examples/fio-rand-read.fio b/examples/fio-rand-read.fio
index bc154668..319a9209 100644
--- a/examples/fio-rand-read.fio
+++ b/examples/fio-rand-read.fio
@@ -7,7 +7,7 @@ rw=randread
bs=4K
direct=0
numjobs=1
-time_based=1
+time_based
runtime=900
[file1]
diff --git a/examples/fio-rand-read.png b/examples/fio-rand-read.png
new file mode 100644
index 00000000..d45664a4
--- /dev/null
+++ b/examples/fio-rand-read.png
Binary files differ
diff --git a/examples/fio-rand-write.fio b/examples/fio-rand-write.fio
index bd1b73a9..55ededbd 100644
--- a/examples/fio-rand-write.fio
+++ b/examples/fio-rand-write.fio
@@ -7,7 +7,7 @@ rw=randwrite
bs=4K
direct=0
numjobs=4
-time_based=1
+time_based
runtime=900
[file1]
diff --git a/examples/fio-rand-write.png b/examples/fio-rand-write.png
new file mode 100644
index 00000000..10e068bc
--- /dev/null
+++ b/examples/fio-rand-write.png
Binary files differ
diff --git a/examples/fio-seq-RW.fio b/examples/fio-seq-RW.fio
index 8f7090f3..89e5c679 100644
--- a/examples/fio-seq-RW.fio
+++ b/examples/fio-seq-RW.fio
@@ -9,7 +9,7 @@ rwmixwrite=40
bs=256K
direct=0
numjobs=4
-time_based=1
+time_based
runtime=900
[file1]
diff --git a/examples/fio-seq-RW.png b/examples/fio-seq-RW.png
new file mode 100644
index 00000000..a2be35ec
--- /dev/null
+++ b/examples/fio-seq-RW.png
Binary files differ
diff --git a/examples/fio-seq-read.fio b/examples/fio-seq-read.fio
index 28de93c8..2b272480 100644
--- a/examples/fio-seq-read.fio
+++ b/examples/fio-seq-read.fio
@@ -5,7 +5,7 @@ rw=read
bs=256K
direct=1
numjobs=1
-time_based=1
+time_based
runtime=900
[file1]
diff --git a/examples/fio-seq-read.png b/examples/fio-seq-read.png
new file mode 100644
index 00000000..cf8f2978
--- /dev/null
+++ b/examples/fio-seq-read.png
Binary files differ
diff --git a/examples/fio-seq-write.fio b/examples/fio-seq-write.fio
index b291a15a..ac6c9eef 100644
--- a/examples/fio-seq-write.fio
+++ b/examples/fio-seq-write.fio
@@ -7,7 +7,7 @@ rw=write
bs=256K
direct=0
numjobs=1
-time_based=1
+time_based
runtime=900
[file1]
diff --git a/examples/fio-seq-write.png b/examples/fio-seq-write.png
new file mode 100644
index 00000000..8db12092
--- /dev/null
+++ b/examples/fio-seq-write.png
Binary files differ
diff --git a/examples/fixed-rate-submission.png b/examples/fixed-rate-submission.png
new file mode 100644
index 00000000..86ca9b3e
--- /dev/null
+++ b/examples/fixed-rate-submission.png
Binary files differ
diff --git a/examples/flow.png b/examples/flow.png
new file mode 100644
index 00000000..26a3d34c
--- /dev/null
+++ b/examples/flow.png
Binary files differ
diff --git a/examples/fsx.fio b/examples/fsx.fio
index 6b48c6fd..22152dc0 100644
--- a/examples/fsx.fio
+++ b/examples/fsx.fio
@@ -9,4 +9,3 @@ bs=4k
norandommap
direct=1
loops=500000
-rwmixcycle=40
diff --git a/examples/fsx.png b/examples/fsx.png
new file mode 100644
index 00000000..b4e13c80
--- /dev/null
+++ b/examples/fsx.png
Binary files differ
diff --git a/examples/ftruncate.png b/examples/ftruncate.png
new file mode 100644
index 00000000..b98895f6
--- /dev/null
+++ b/examples/ftruncate.png
Binary files differ
diff --git a/examples/gfapi.png b/examples/gfapi.png
new file mode 100644
index 00000000..acc6a6ae
--- /dev/null
+++ b/examples/gfapi.png
Binary files differ
diff --git a/examples/gpudirect-rdmaio-client.png b/examples/gpudirect-rdmaio-client.png
new file mode 100644
index 00000000..eac79858
--- /dev/null
+++ b/examples/gpudirect-rdmaio-client.png
Binary files differ
diff --git a/examples/gpudirect-rdmaio-server.png b/examples/gpudirect-rdmaio-server.png
new file mode 100644
index 00000000..e043d7c0
--- /dev/null
+++ b/examples/gpudirect-rdmaio-server.png
Binary files differ
diff --git a/examples/http-s3.png b/examples/http-s3.png
new file mode 100644
index 00000000..2021e85e
--- /dev/null
+++ b/examples/http-s3.png
Binary files differ
diff --git a/examples/http-swift.png b/examples/http-swift.png
new file mode 100644
index 00000000..9928fb16
--- /dev/null
+++ b/examples/http-swift.png
Binary files differ
diff --git a/examples/http-webdav.png b/examples/http-webdav.png
new file mode 100644
index 00000000..c37c3de5
--- /dev/null
+++ b/examples/http-webdav.png
Binary files differ
diff --git a/examples/ime.png b/examples/ime.png
new file mode 100644
index 00000000..f636f5e7
--- /dev/null
+++ b/examples/ime.png
Binary files differ
diff --git a/examples/iometer-file-access-server.png b/examples/iometer-file-access-server.png
new file mode 100644
index 00000000..e3124554
--- /dev/null
+++ b/examples/iometer-file-access-server.png
Binary files differ
diff --git a/examples/jesd219.fio b/examples/jesd219.fio
index 24f16f77..deddd9a7 100644
--- a/examples/jesd219.fio
+++ b/examples/jesd219.fio
@@ -17,4 +17,4 @@ bssplit=512/4:1024/1:1536/1:2048/1:2560/1:3072/1:3584/1:4k/67:8k/10:16k/7:32k/3:
blockalign=4k
random_distribution=zoned:50/5:30/15:20/80
filename=/dev/nvme0n1
-group_reporting=1
+group_reporting
diff --git a/examples/jesd219.png b/examples/jesd219.png
new file mode 100644
index 00000000..73b5a124
--- /dev/null
+++ b/examples/jesd219.png
Binary files differ
diff --git a/examples/latency-profile.png b/examples/latency-profile.png
new file mode 100644
index 00000000..50650df8
--- /dev/null
+++ b/examples/latency-profile.png
Binary files differ
diff --git a/examples/libcufile-cufile.fio b/examples/libcufile-cufile.fio
new file mode 100644
index 00000000..94a64b5a
--- /dev/null
+++ b/examples/libcufile-cufile.fio
@@ -0,0 +1,42 @@
+# Example libcufile job, using cufile I/O
+#
+# Required environment variables:
+# GPU_DEV_IDS : refer to option 'gpu_dev_ids'
+# FIO_DIR : 'directory'. This job uses cuda_io=cufile, so path(s) must
+# point to GPUDirect Storage filesystem(s)
+#
+
+[global]
+ioengine=libcufile
+directory=${FIO_DIR}
+gpu_dev_ids=${GPU_DEV_IDS}
+cuda_io=cufile
+# 'direct' must be 1 when using cuda_io=cufile
+direct=1
+# Performance is negatively affected if 'bs' is not a multiple of 4k.
+# Refer to GDS cuFile documentation.
+bs=1m
+size=1m
+numjobs=16
+# cudaMalloc fails if too many processes attach to the GPU, use threads.
+thread
+
+[read]
+rw=read
+
+[write]
+rw=write
+
+[randread]
+rw=randread
+
+[randwrite]
+rw=randwrite
+
+[verify]
+rw=write
+verify=md5
+
+[randverify]
+rw=randwrite
+verify=md5
diff --git a/examples/libcufile-cufile.png b/examples/libcufile-cufile.png
new file mode 100644
index 00000000..f3758e5d
--- /dev/null
+++ b/examples/libcufile-cufile.png
Binary files differ
diff --git a/examples/libcufile-posix.fio b/examples/libcufile-posix.fio
new file mode 100644
index 00000000..2bce22e6
--- /dev/null
+++ b/examples/libcufile-posix.fio
@@ -0,0 +1,41 @@
+# Example libcufile job, using POSIX I/O
+#
+# Required environment variables:
+# GPU_DEV_IDS : refer to option 'gpu_dev_ids'
+# FIO_DIR : 'directory'. cuda_io=posix, so the path(s) may point
+# to any POSIX filesystem(s)
+#
+
+[global]
+ioengine=libcufile
+directory=${FIO_DIR}
+gpu_dev_ids=${GPU_DEV_IDS}
+cuda_io=posix
+# 'direct' may be 1 or 0 when using cuda_io=posix
+direct=0
+# there are no unusual requirements for 'bs' when cuda_io=posix
+bs=1m
+size=1G
+numjobs=16
+# cudaMalloc fails if too many processes attach to the GPU, use threads
+thread
+
+[read]
+rw=read
+
+[write]
+rw=write
+
+[randread]
+rw=randread
+
+[randwrite]
+rw=randwrite
+
+[verify]
+rw=write
+verify=md5
+
+[randverify]
+rw=randwrite
+verify=md5
diff --git a/examples/libcufile-posix.png b/examples/libcufile-posix.png
new file mode 100644
index 00000000..7818feb4
--- /dev/null
+++ b/examples/libcufile-posix.png
Binary files differ
diff --git a/examples/libhdfs.png b/examples/libhdfs.png
new file mode 100644
index 00000000..e774c911
--- /dev/null
+++ b/examples/libhdfs.png
Binary files differ
diff --git a/examples/libiscsi.png b/examples/libiscsi.png
new file mode 100644
index 00000000..d0006cc0
--- /dev/null
+++ b/examples/libiscsi.png
Binary files differ
diff --git a/examples/libpmem.fio b/examples/libpmem.fio
index 65b1d687..3b854a32 100644
--- a/examples/libpmem.fio
+++ b/examples/libpmem.fio
@@ -1,9 +1,9 @@
[global]
bs=4k
-size=8g
+size=10g
ioengine=libpmem
norandommap
-time_based=1
+time_based
group_reporting
invalidate=1
disable_lat=1
@@ -13,21 +13,11 @@ clat_percentiles=0
iodepth=1
iodepth_batch=1
-thread=1
+thread
numjobs=1
runtime=300
#
-# In case of 'scramble_buffers=1', the source buffer
-# is rewritten with a random value every write operations.
-#
-# But when 'scramble_buffers=0' is set, the source buffer isn't
-# rewritten. So it will be likely that the source buffer is in CPU
-# cache and it seems to be high performance.
-#
-scramble_buffers=0
-
-#
# depends on direct option, flags are set for pmem_memcpy() call:
# direct=1 - PMEM_F_MEM_NONTEMPORAL,
# direct=0 - PMEM_F_MEM_TEMPORAL.
@@ -39,9 +29,19 @@ direct=1
#
sync=1
+#
+# In case of 'scramble_buffers=1', the source buffer
+# is rewritten with a random value every write operation.
+#
+# But when 'scramble_buffers=0' is set, the source buffer isn't
+# rewritten. So it will be likely that the source buffer is in CPU
+# cache and it seems to be high write performance.
+#
+scramble_buffers=1
#
-# Setting for fio process's CPU Node and Memory Node
+# Setting for fio process's CPU Node and Memory Node.
+# Set proper node below or use `numactl` command along with FIO.
#
numa_cpu_nodes=0
numa_mem_policy=bind:0
@@ -53,21 +53,22 @@ cpus_allowed_policy=split
#
# The libpmem engine does IO to files in a DAX-mounted filesystem.
-# The filesystem should be created on an NVDIMM (e.g /dev/pmem0)
+# The filesystem should be created on a Non-Volatile DIMM (e.g /dev/pmem0)
# and then mounted with the '-o dax' option. Note that the engine
# accesses the underlying NVDIMM directly, bypassing the kernel block
# layer, so the usual filesystem/disk performance monitoring tools such
# as iostat will not provide useful data.
#
-directory=/mnt/pmem0
+#filename=/mnt/pmem/somefile
+directory=/mnt/pmem
[libpmem-seqwrite]
rw=write
stonewall
-#[libpmem-seqread]
-#rw=read
-#stonewall
+[libpmem-seqread]
+rw=read
+stonewall
#[libpmem-randwrite]
#rw=randwrite
diff --git a/examples/libpmem.png b/examples/libpmem.png
new file mode 100644
index 00000000..8a9a1432
--- /dev/null
+++ b/examples/libpmem.png
Binary files differ
diff --git a/examples/librpma_apm-client.fio b/examples/librpma_apm-client.fio
new file mode 100644
index 00000000..82a5d20c
--- /dev/null
+++ b/examples/librpma_apm-client.fio
@@ -0,0 +1,24 @@
+# Example of the librpma_apm_client job
+
+[global]
+ioengine=librpma_apm_client
+create_serialize=0 # (required) forces specific initiation sequence
+serverip=[serverip] #IP address the server is listening on
+port=7204 # port(s) the server will listen on, <port; port + numjobs - 1> will be used
+thread
+
+# The client will get a remote memory region description after establishing
+# a connection.
+
+[client]
+numjobs=1 # number of parallel connections
+group_reporting=1
+sync=1 # 1 is the best for latency measurements, 0 for bandwidth
+iodepth=2 # total number of ious
+iodepth_batch_submit=1 # number of ious to be submitted at once
+rw=write # read/write/randread/randwrite/readwrite/rw
+rwmixread=70 # % of a mixed workload that should be reads
+blocksize=4KiB
+ramp_time=15s # gives some time to stabilize the workload
+time_based
+runtime=60s # run the workload for the specified period of time
diff --git a/examples/librpma_apm-client.png b/examples/librpma_apm-client.png
new file mode 100644
index 00000000..2fe02cdf
--- /dev/null
+++ b/examples/librpma_apm-client.png
Binary files differ
diff --git a/examples/librpma_apm-server.fio b/examples/librpma_apm-server.fio
new file mode 100644
index 00000000..062b5215
--- /dev/null
+++ b/examples/librpma_apm-server.fio
@@ -0,0 +1,26 @@
+# Example of the librpma_apm_server job
+
+[global]
+ioengine=librpma_apm_server
+create_serialize=0 # (required) forces specific initiation sequence
+kb_base=1000 # turn on the straight units handling (non-compatibility mode)
+serverip=[serverip] # IP address to listen on
+port=7204 # port(s) the server jobs will listen on, ports <port; port + numjobs - 1> will be used
+thread
+
+# The server side spawns one thread for each expected connection from
+# the client-side, opens and registers the range dedicated for this thread
+# (a workspace) from the provided memory.
+# Each of the server threads accepts a connection on the dedicated port
+# (different for each and every working thread) and waits for it to end up,
+# and closes itself.
+
+[server]
+# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible
+# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
+direct_write_to_pmem=0
+
+numjobs=1 # number of expected incomming connections
+size=100MiB # size of workspace for a single connection
+filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
+# filename=/dev/dax1.0
diff --git a/examples/librpma_apm-server.png b/examples/librpma_apm-server.png
new file mode 100644
index 00000000..f78ae02e
--- /dev/null
+++ b/examples/librpma_apm-server.png
Binary files differ
diff --git a/examples/librpma_gpspm-client.fio b/examples/librpma_gpspm-client.fio
new file mode 100644
index 00000000..843382df
--- /dev/null
+++ b/examples/librpma_gpspm-client.fio
@@ -0,0 +1,23 @@
+# Example of the librpma_gpspm_client job
+
+[global]
+ioengine=librpma_gpspm_client
+create_serialize=0 # (required) forces specific initiation sequence
+serverip=[serverip] #IP address the server is listening on
+port=7204 # port(s) the server will listen on, <port; port + numjobs - 1> will be used
+thread
+
+# The client will get a remote memory region description after establishing
+# a connection.
+
+[client]
+numjobs=1 # number of parallel connections
+group_reporting=1
+sync=1 # 1 is the best for latency measurements, 0 for bandwidth
+iodepth=2 # total number of ious
+iodepth_batch_submit=1 # number of ious to be submitted at once
+rw=write # write/randwrite
+blocksize=4KiB
+ramp_time=15s # gives some time to stabilize the workload
+time_based
+runtime=60s # run the workload for the specified period of time
diff --git a/examples/librpma_gpspm-client.png b/examples/librpma_gpspm-client.png
new file mode 100644
index 00000000..0c975a27
--- /dev/null
+++ b/examples/librpma_gpspm-client.png
Binary files differ
diff --git a/examples/librpma_gpspm-server.fio b/examples/librpma_gpspm-server.fio
new file mode 100644
index 00000000..67e92a28
--- /dev/null
+++ b/examples/librpma_gpspm-server.fio
@@ -0,0 +1,33 @@
+# Example of the librpma_gpspm_server job
+
+[global]
+ioengine=librpma_gpspm_server
+create_serialize=0 # (required) forces specific initiation sequence
+kb_base=1000 # turn on the straight units handling (non-compatibility mode)
+serverip=[serverip] #IP address to listen on
+port=7204 # port(s) the server jobs will listen on, ports <port; port + numjobs - 1> will be used
+thread
+
+# The server side spawns one thread for each expected connection from
+# the client-side, opens and registers the range dedicated for this thread
+# (a workspace) from the provided memory.
+# Each of the server threads accepts a connection on the dedicated port
+# (different for each and every working thread), accepts and executes flush
+# requests, and sends back a flush response for each of the requests.
+# When the client is done it sends the termination notice to the server's thread.
+
+[server]
+# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible
+# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
+direct_write_to_pmem=0
+# set to 0 (false) to wait for completion instead of busy-wait polling completion.
+busy_wait_polling=1
+numjobs=1 # number of expected incomming connections
+iodepth=2 # number of parallel GPSPM requests
+size=100MiB # size of workspace for a single connection
+filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
+# filename=/dev/dax1.0
+
+# The client will terminate the server when the client will end up its job.
+time_based
+runtime=365d
diff --git a/examples/librpma_gpspm-server.png b/examples/librpma_gpspm-server.png
new file mode 100644
index 00000000..56124533
--- /dev/null
+++ b/examples/librpma_gpspm-server.png
Binary files differ
diff --git a/examples/libzbc-rand-write.fio b/examples/libzbc-rand-write.fio
index ce5870e4..41496219 100644
--- a/examples/libzbc-rand-write.fio
+++ b/examples/libzbc-rand-write.fio
@@ -12,7 +12,7 @@ max_open_zones=32
bs=512K
direct=1
numjobs=16
-time_based=1
+time_based
runtime=300
[dev1]
diff --git a/examples/libzbc-rand-write.png b/examples/libzbc-rand-write.png
new file mode 100644
index 00000000..1d277412
--- /dev/null
+++ b/examples/libzbc-rand-write.png
Binary files differ
diff --git a/examples/libzbc-seq-read.png b/examples/libzbc-seq-read.png
new file mode 100644
index 00000000..5a532228
--- /dev/null
+++ b/examples/libzbc-seq-read.png
Binary files differ
diff --git a/examples/mtd.fio b/examples/mtd.fio
index e5dcea4c..0a7f2bae 100644
--- a/examples/mtd.fio
+++ b/examples/mtd.fio
@@ -6,7 +6,7 @@ ignore_error=,EIO
blocksize=512,512,16384
skip_bad=1
-[write]
+[trim]
stonewall
rw=trim
@@ -14,7 +14,7 @@ rw=trim
stonewall
rw=write
-[write]
+[trimwrite]
stonewall
block_error_percentiles=1
rw=trimwrite
diff --git a/examples/mtd.png b/examples/mtd.png
new file mode 100644
index 00000000..8cb3692e
--- /dev/null
+++ b/examples/mtd.png
Binary files differ
diff --git a/examples/nbd.png b/examples/nbd.png
new file mode 100644
index 00000000..e3bcf610
--- /dev/null
+++ b/examples/nbd.png
Binary files differ
diff --git a/examples/netio.png b/examples/netio.png
new file mode 100644
index 00000000..81afd41d
--- /dev/null
+++ b/examples/netio.png
Binary files differ
diff --git a/examples/netio_multicast.png b/examples/netio_multicast.png
new file mode 100644
index 00000000..f07ab4b7
--- /dev/null
+++ b/examples/netio_multicast.png
Binary files differ
diff --git a/examples/nfs.fio b/examples/nfs.fio
new file mode 100644
index 00000000..f856cebf
--- /dev/null
+++ b/examples/nfs.fio
@@ -0,0 +1,22 @@
+[global]
+nfs_url=nfs://127.0.0.1/nfs
+blocksize=524288
+iodepth=10
+ioengine=nfs
+size=104857600
+lat_percentiles=1
+group_reporting
+numjobs=10
+ramp_time=5s
+filename_format=myfiles.$clientuid.$jobnum.$filenum
+time_based=1
+
+[write]
+rw=write
+runtime=10s
+stonewall
+
+[read]
+wait_for=write
+rw=randread
+runtime=10s
diff --git a/examples/nfs.png b/examples/nfs.png
new file mode 100644
index 00000000..29dbca0d
--- /dev/null
+++ b/examples/nfs.png
Binary files differ
diff --git a/examples/null.fio b/examples/null.fio
index 9d2f3e00..4534cbdd 100644
--- a/examples/null.fio
+++ b/examples/null.fio
@@ -7,4 +7,3 @@ ioengine=null
size=100g
rw=randread
norandommap
-time_based=0
diff --git a/examples/null.png b/examples/null.png
new file mode 100644
index 00000000..052671db
--- /dev/null
+++ b/examples/null.png
Binary files differ
diff --git a/examples/numa.png b/examples/numa.png
new file mode 100644
index 00000000..1ef45759
--- /dev/null
+++ b/examples/numa.png
Binary files differ
diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio
index 2d5ecfce..59bb2a8a 100644
--- a/examples/pmemblk.fio
+++ b/examples/pmemblk.fio
@@ -2,7 +2,7 @@
bs=1m
ioengine=pmemblk
norandommap
-time_based=1
+time_based
runtime=30
group_reporting
disable_lat=1
@@ -19,7 +19,7 @@ cpus_allowed_policy=split
#
iodepth=1
direct=1
-thread=1
+thread
numjobs=16
#
# Unlink can be used to remove the files when done, but if you are
@@ -55,7 +55,7 @@ unlink=0
# size, this is not required.
#
filename=/pmem0/fio-test,4096,1024
-filename=/pmem1/fio-test,4096,1024
+#filename=/pmem1/fio-test,4096,1024
[pmemblk-write]
rw=randwrite
diff --git a/examples/pmemblk.png b/examples/pmemblk.png
new file mode 100644
index 00000000..250e254b
--- /dev/null
+++ b/examples/pmemblk.png
Binary files differ
diff --git a/examples/poisson-rate-submission.png b/examples/poisson-rate-submission.png
new file mode 100644
index 00000000..739c2560
--- /dev/null
+++ b/examples/poisson-rate-submission.png
Binary files differ
diff --git a/examples/rados.png b/examples/rados.png
new file mode 100644
index 00000000..91bd61a0
--- /dev/null
+++ b/examples/rados.png
Binary files differ
diff --git a/examples/rand-zones.png b/examples/rand-zones.png
new file mode 100644
index 00000000..13cbfb47
--- /dev/null
+++ b/examples/rand-zones.png
Binary files differ
diff --git a/examples/rbd.png b/examples/rbd.png
new file mode 100644
index 00000000..f1186139
--- /dev/null
+++ b/examples/rbd.png
Binary files differ
diff --git a/examples/rdmaio-client.png b/examples/rdmaio-client.png
new file mode 100644
index 00000000..4e4bc289
--- /dev/null
+++ b/examples/rdmaio-client.png
Binary files differ
diff --git a/examples/rdmaio-server.png b/examples/rdmaio-server.png
new file mode 100644
index 00000000..fc344725
--- /dev/null
+++ b/examples/rdmaio-server.png
Binary files differ
diff --git a/examples/ssd-steadystate.png b/examples/ssd-steadystate.png
new file mode 100644
index 00000000..eb27f8a4
--- /dev/null
+++ b/examples/ssd-steadystate.png
Binary files differ
diff --git a/examples/ssd-test.png b/examples/ssd-test.png
new file mode 100644
index 00000000..a92ed153
--- /dev/null
+++ b/examples/ssd-test.png
Binary files differ
diff --git a/examples/steadystate.fio b/examples/steadystate.fio
index 26fb8083..a38a3438 100644
--- a/examples/steadystate.fio
+++ b/examples/steadystate.fio
@@ -7,7 +7,7 @@
[global]
threads=1
-group_reporting=1
+group_reporting
time_based
size=128m
diff --git a/examples/steadystate.png b/examples/steadystate.png
new file mode 100644
index 00000000..4bb90484
--- /dev/null
+++ b/examples/steadystate.png
Binary files differ
diff --git a/examples/surface-scan.fio b/examples/surface-scan.fio
index dc3373a2..98faf69a 100644
--- a/examples/surface-scan.fio
+++ b/examples/surface-scan.fio
@@ -1,7 +1,7 @@
; writes 512 byte verification blocks until the disk is full,
; then verifies written data
[global]
-thread=1
+thread
bs=64k
direct=1
ioengine=sync
diff --git a/examples/surface-scan.png b/examples/surface-scan.png
new file mode 100644
index 00000000..00573808
--- /dev/null
+++ b/examples/surface-scan.png
Binary files differ
diff --git a/examples/test.png b/examples/test.png
new file mode 100644
index 00000000..6be50029
--- /dev/null
+++ b/examples/test.png
Binary files differ
diff --git a/examples/tiobench-example.png b/examples/tiobench-example.png
new file mode 100644
index 00000000..14410326
--- /dev/null
+++ b/examples/tiobench-example.png
Binary files differ
diff --git a/examples/waitfor.fio b/examples/waitfor.fio
index 95fad005..096c3153 100644
--- a/examples/waitfor.fio
+++ b/examples/waitfor.fio
@@ -1,6 +1,6 @@
[global]
threads=1
-group_reporting=1
+group_reporting
filename=/tmp/data
filesize=128m
diff --git a/examples/waitfor.png b/examples/waitfor.png
new file mode 100644
index 00000000..64e4bf94
--- /dev/null
+++ b/examples/waitfor.png
Binary files differ
diff --git a/examples/zbd-rand-write.fio b/examples/zbd-rand-write.fio
index 1b3f2088..46cddd06 100644
--- a/examples/zbd-rand-write.fio
+++ b/examples/zbd-rand-write.fio
@@ -12,7 +12,7 @@ max_open_zones=32
bs=512K
direct=1
numjobs=16
-time_based=1
+time_based
runtime=180
[dev1]
diff --git a/examples/zbd-rand-write.png b/examples/zbd-rand-write.png
new file mode 100644
index 00000000..d58721be
--- /dev/null
+++ b/examples/zbd-rand-write.png
Binary files differ
diff --git a/examples/zbd-seq-read.png b/examples/zbd-seq-read.png
new file mode 100644
index 00000000..b81a08c4
--- /dev/null
+++ b/examples/zbd-seq-read.png
Binary files differ
diff --git a/examples/zipf.png b/examples/zipf.png
new file mode 100644
index 00000000..cb2a9816
--- /dev/null
+++ b/examples/zipf.png
Binary files differ
diff --git a/file.h b/file.h
index 493ec04a..faf65a2a 100644
--- a/file.h
+++ b/file.h
@@ -207,6 +207,7 @@ extern "C" {
extern int __must_check generic_open_file(struct thread_data *, struct fio_file *);
extern int __must_check generic_close_file(struct thread_data *, struct fio_file *);
extern int __must_check generic_get_file_size(struct thread_data *, struct fio_file *);
+extern int __must_check generic_prepopulate_file(struct thread_data *, struct fio_file *);
#ifdef __cplusplus
}
#endif
diff --git a/filehash.c b/filehash.c
index b55ab734..71ec7b18 100644
--- a/filehash.c
+++ b/filehash.c
@@ -60,10 +60,8 @@ static struct fio_file *__lookup_file_hash(const char *name)
if (!f->file_name)
continue;
- if (!strcmp(f->file_name, name)) {
- assert(f->fd != -1);
+ if (!strcmp(f->file_name, name))
return f;
- }
}
return NULL;
diff --git a/filesetup.c b/filesetup.c
index e44f31c7..296de5a1 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -226,18 +226,22 @@ static int extend_file(struct thread_data *td, struct fio_file *f)
if (r < 0) {
int __e = errno;
- if (__e == ENOSPC) {
+ if (__e == ENOSPC || __e == EDQUOT) {
+ const char *__e_name;
if (td->o.fill_device)
break;
- log_info("fio: ENOSPC on laying out "
- "file, stopping\n");
- break;
+ if (__e == ENOSPC)
+ __e_name = "ENOSPC";
+ else
+ __e_name = "EDQUOT";
+ log_info("fio: %s on laying out "
+ "file, stopping\n", __e_name);
}
td_verror(td, errno, "write");
} else
td_verror(td, EIO, "write");
- break;
+ goto err;
}
}
@@ -339,6 +343,95 @@ error:
return ret;
}
+/*
+ * Generic function to prepopulate regular file with data.
+ * Useful if you want to make sure I/O engine has data to read.
+ * Leaves f->fd open on success, caller must close.
+ */
+int generic_prepopulate_file(struct thread_data *td, struct fio_file *f)
+{
+ int flags;
+ unsigned long long left, bs;
+ char *b = NULL;
+
+ /* generic function for regular files only */
+ assert(f->filetype == FIO_TYPE_FILE);
+
+ if (read_only) {
+ log_err("fio: refusing to write a file due to read-only\n");
+ return 0;
+ }
+
+ flags = O_WRONLY;
+ if (td->o.allow_create)
+ flags |= O_CREAT;
+
+#ifdef WIN32
+ flags |= _O_BINARY;
+#endif
+
+ dprint(FD_FILE, "open file %s, flags %x\n", f->file_name, flags);
+ f->fd = open(f->file_name, flags, 0644);
+ if (f->fd < 0) {
+ int err = errno;
+
+ if (err == ENOENT && !td->o.allow_create)
+ log_err("fio: file creation disallowed by "
+ "allow_file_create=0\n");
+ else
+ td_verror(td, err, "open");
+ return 1;
+ }
+
+ left = f->real_file_size;
+ bs = td->o.max_bs[DDIR_WRITE];
+ if (bs > left)
+ bs = left;
+
+ b = malloc(bs);
+ if (!b) {
+ td_verror(td, errno, "malloc");
+ goto err;
+ }
+
+ while (left && !td->terminate) {
+ ssize_t r;
+
+ if (bs > left)
+ bs = left;
+
+ fill_io_buffer(td, b, bs, bs);
+
+ r = write(f->fd, b, bs);
+
+ if (r > 0) {
+ left -= r;
+ } else {
+ td_verror(td, errno, "write");
+ goto err;
+ }
+ }
+
+ if (td->terminate) {
+ dprint(FD_FILE, "terminate unlink %s\n", f->file_name);
+ td_io_unlink_file(td, f);
+ } else if (td->o.create_fsync) {
+ if (fsync(f->fd) < 0) {
+ td_verror(td, errno, "fsync");
+ goto err;
+ }
+ }
+
+ free(b);
+ return 0;
+err:
+ close(f->fd);
+ f->fd = -1;
+ if (b)
+ free(b);
+ return 1;
+}
+
unsigned long long get_rand_file_size(struct thread_data *td)
{
unsigned long long ret, sized;
@@ -655,8 +748,7 @@ int generic_open_file(struct thread_data *td, struct fio_file *f)
}
flags |= OS_O_DIRECT | FIO_O_ATOMIC;
}
- if (td->o.sync_io)
- flags |= O_SYNC;
+ flags |= td->o.sync_io;
if (td->o.create_on_open && td->o.allow_create)
flags |= O_CREAT;
skip_flags:
@@ -817,7 +909,7 @@ static unsigned long long get_fs_free_counts(struct thread_data *td)
} else if (f->filetype != FIO_TYPE_FILE)
continue;
- snprintf(buf, ARRAY_SIZE(buf), "%s", f->file_name);
+ snprintf(buf, FIO_ARRAY_SIZE(buf), "%s", f->file_name);
if (stat(buf, &sb) < 0) {
if (errno != ENOENT)
@@ -840,7 +932,7 @@ static unsigned long long get_fs_free_counts(struct thread_data *td)
continue;
fm = calloc(1, sizeof(*fm));
- snprintf(fm->__base, ARRAY_SIZE(fm->__base), "%s", buf);
+ snprintf(fm->__base, FIO_ARRAY_SIZE(fm->__base), "%s", buf);
fm->base = basename(fm->__base);
fm->key = sb.st_dev;
flist_add(&fm->list, &list);
@@ -1031,6 +1123,13 @@ int setup_files(struct thread_data *td)
if (o->read_iolog_file)
goto done;
+ if (td->o.zone_mode == ZONE_MODE_ZBD) {
+ err = zbd_init_files(td);
+ if (err)
+ goto err_out;
+ }
+ zbd_recalc_options_with_zone_granularity(td);
+
/*
* check sizes. if the files/devices do not exist and the size
* isn't passed to fio, abort.
@@ -1201,7 +1300,7 @@ int setup_files(struct thread_data *td)
o->size = total_size;
if (o->size < td_min_bs(td)) {
- log_err("fio: blocksize too large for data set\n");
+ log_err("fio: blocksize is larger than data set range\n");
goto err_out;
}
@@ -1260,6 +1359,43 @@ int setup_files(struct thread_data *td)
goto err_out;
/*
+ * Prepopulate files with data. It might be expected to read some
+ * "real" data instead of zero'ed files (if no writes to file occurred
+ * prior to a read job). Engine has to provide a way to do that.
+ */
+ if (td->io_ops->prepopulate_file) {
+ temp_stall_ts = 1;
+
+ for_each_file(td, f, i) {
+ if (output_format & FIO_OUTPUT_NORMAL) {
+ log_info("%s: Prepopulating IO file (%s)\n",
+ o->name, f->file_name);
+ }
+
+ err = td->io_ops->prepopulate_file(td, f);
+ if (err)
+ break;
+
+ err = __file_invalidate_cache(td, f, f->file_offset,
+ f->io_size);
+
+ /*
+ * Shut up static checker
+ */
+ if (f->fd != -1)
+ close(f->fd);
+
+ f->fd = -1;
+ if (err)
+ break;
+ }
+ temp_stall_ts = 0;
+ }
+
+ if (err)
+ goto err_out;
+
+ /*
* iolog already set the total io size, if we read back
* stored entries.
*/
@@ -1271,16 +1407,17 @@ int setup_files(struct thread_data *td)
}
done:
- if (o->create_only)
- td->done = 1;
-
- td_restore_runstate(td, old_state);
-
if (td->o.zone_mode == ZONE_MODE_ZBD) {
err = zbd_setup_files(td);
if (err)
goto err_out;
}
+
+ if (o->create_only)
+ td->done = 1;
+
+ td_restore_runstate(td, old_state);
+
return 0;
err_offset:
@@ -1321,11 +1458,11 @@ static void __init_rand_distribution(struct thread_data *td, struct fio_file *f)
seed = td->rand_seeds[4];
if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
- zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed);
+ zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, td->o.random_center.u.f, seed);
else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
- pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed);
+ pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, td->o.random_center.u.f, seed);
else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
- gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, seed);
+ gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, td->o.random_center.u.f, seed);
}
static bool init_rand_distribution(struct thread_data *td)
diff --git a/fio.1 b/fio.1
index a881277c..9c12ad13 100644
--- a/fio.1
+++ b/fio.1
@@ -288,6 +288,15 @@ Pi means pebi (Pi) or 1024**5
.PD
.RE
.P
+For Zone Block Device Mode:
+.RS
+.P
+.PD 0
+z means Zone
+.P
+.PD
+.RE
+.P
With `kb_base=1024' (the default), the unit prefixes are opposite
from those specified in the SI and IEC 80000-13 standards to provide
compatibility with old scripts. For example, 4k means 4096.
@@ -348,6 +357,9 @@ us or usec means microseconds
.PD
.RE
.P
+`z' suffix specifies that the value is measured in zones.
+Value is recalculated once block device's zone size becomes known.
+.P
If the option accepts an upper and lower range, use a colon ':' or
minus '\-' to separate such values. See \fIirange\fR parameter type.
If the lower value specified happens to be larger than the upper value
@@ -584,6 +596,9 @@ string:
.B $jobname
The name of the worker thread or process.
.TP
+.B $clientuid
+IP of the fio process when using client/server mode.
+.TP
.B $jobnum
The incremental number of the worker thread or process.
.TP
@@ -684,7 +699,8 @@ of how that would work.
.TP
.BI ioscheduler \fR=\fPstr
Attempt to switch the device hosting the file to the specified I/O scheduler
-before running.
+before running. If the file is a pipe, a character device file or if device
+hosting the file could not be determined, this option is ignored.
.TP
.BI create_serialize \fR=\fPbool
If true, serialize the file creation for the jobs. This may be handy to
@@ -780,7 +796,7 @@ If not specified it defaults to the zone size. If the target device is a zoned
block device, the zone capacity is obtained from the device information and this
option is ignored.
.TP
-.BI zoneskip \fR=\fPint
+.BI zoneskip \fR=\fPint[z]
For \fBzonemode\fR=strided, the number of bytes to skip after \fBzonesize\fR
bytes of data have been transferred.
@@ -819,6 +835,11 @@ threads/processes.
.BI job_max_open_zones \fR=\fPint
Limit on the number of simultaneously opened zones per single thread/process.
.TP
+.BI ignore_zone_limits \fR=\fPbool
+If this isn't set, fio will query the max open zones limit from the zoned block
+device, and exit if the specified \fBmax_open_zones\fR value is larger than the
+limit reported by the device. Default: false.
+.TP
.BI zone_reset_threshold \fR=\fPfloat
A number between zero and one that indicates the ratio of logical blocks with
data to the total number of logical blocks in the test above which zones
@@ -918,10 +939,32 @@ behaves in a similar fashion, except it sends the same offset 8 number of
times before generating a new offset.
.RE
.TP
-.BI unified_rw_reporting \fR=\fPbool
+.BI unified_rw_reporting \fR=\fPstr
Fio normally reports statistics on a per data direction basis, meaning that
-reads, writes, and trims are accounted and reported separately. If this
-option is set fio sums the results and report them as "mixed" instead.
+reads, writes, and trims are accounted and reported separately. This option
+determines whether fio reports the results normally, summed together, or as
+both options.
+Accepted values are:
+.RS
+.TP
+.B none
+Normal statistics reporting.
+.TP
+.B mixed
+Statistics are summed per data direction and reported together.
+.TP
+.B both
+Statistics are reported normally, followed by the mixed statistics.
+.TP
+.B 0
+Backward-compatible alias for \fBnone\fR.
+.TP
+.B 1
+Backward-compatible alias for \fBmixed\fR.
+.TP
+.B 2
+Alias for \fBboth\fR.
+.RE
.TP
.BI randrepeat \fR=\fPbool
Seed the random number generator used for random I/O patterns in a
@@ -1030,22 +1073,23 @@ The values are all relative to each other, and no absolute meaning
should be associated with them.
.RE
.TP
-.BI offset \fR=\fPint
+.BI offset \fR=\fPint[%|z]
Start I/O at the provided offset in the file, given as either a fixed size in
-bytes or a percentage. If a percentage is given, the generated offset will be
+bytes, zones or a percentage. If a percentage is given, the generated offset will be
aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if
provided. Data before the given offset will not be touched. This
effectively caps the file size at `real_size \- offset'. Can be combined with
\fBsize\fR to constrain the start and end range of the I/O workload.
A percentage can be specified by a number between 1 and 100 followed by '%',
-for example, `offset=20%' to specify 20%.
+for example, `offset=20%' to specify 20%. In ZBD mode, value can be set as
+number of zones using 'z'.
.TP
.BI offset_align \fR=\fPint
If set to non-zero value, the byte offset generated by a percentage \fBoffset\fR
is aligned upwards to this value. Defaults to 0 meaning that a percentage
offset is aligned to the minimum block size.
.TP
-.BI offset_increment \fR=\fPint
+.BI offset_increment \fR=\fPint[%|z]
If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR
* thread_number', where the thread number is a counter that starts at 0 and
is incremented for each sub-job (i.e. when \fBnumjobs\fR option is
@@ -1053,7 +1097,8 @@ specified). This option is useful if there are several jobs which are
intended to operate on a file in parallel disjoint segments, with even
spacing between the starting points. Percentages can be used for this option.
If a percentage is given, the generated offset will be aligned to the minimum
-\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.
+\fBblocksize\fR or to the value of \fBoffset_align\fR if provided. In ZBD mode, value
+can be set as number of zones using 'z'.
.TP
.BI number_ios \fR=\fPint
Fio will normally perform I/Os until it has exhausted the size of the region
@@ -1132,7 +1177,7 @@ first. This may interfere with a given rate setting, if fio is asked to
limit reads or writes to a certain rate. If that is the case, then the
distribution may be skewed. Default: 50.
.TP
-.BI random_distribution \fR=\fPstr:float[,str:float][,str:float]
+.BI random_distribution \fR=\fPstr:float[:float][,str:float][,str:float]
By default, fio will use a completely uniform random distribution when asked
to perform random I/O. Sometimes it is useful to skew the distribution in
specific ways, ensuring that some parts of the data is more hot than others.
@@ -1168,6 +1213,14 @@ option. If a non\-uniform model is used, fio will disable use of the random
map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is
supplied as a value between 0 and 100.
.P
+The second, optional float is allowed for \fBpareto\fR, \fBzipf\fR and \fBnormal\fR
+distributions. It allows setting the base of the distribution in a non-default place, giving
+more control over the most probable outcome. This value is in range [0-1] which maps linearly to
+range of possible random values.
+Defaults are: random for \fBpareto\fR and \fBzipf\fR, and 0.5 for \fBnormal\fR.
+If you wanted to use \fBzipf\fR with a `theta` of 1.2 centered on 1/4 of allowed value range,
+you would use `random_distribution=zipf:1.2:0.25`.
+.P
For a \fBzoned\fR distribution, fio supports specifying percentages of I/O
access that should fall within what range of the file or device. For
example, given a criteria of:
@@ -1456,15 +1509,79 @@ all \-\- this option only controls the distribution of unique buffers. Setting
this option will also enable \fBrefill_buffers\fR to prevent every buffer
being identical.
.TP
+.BI dedupe_mode \fR=\fPstr
+If \fBdedupe_percentage\fR is given, then this option controls how fio
+generates the dedupe buffers.
+.RS
+.RS
+.TP
+.B repeat
+.P
+.RS
+Generate dedupe buffers by repeating previous writes
+.RE
+.TP
+.B working_set
+.P
+.RS
+Generate dedupe buffers from working set
+.RE
+.RE
+.P
+\fBrepeat\fR is the default option for fio. Dedupe buffers are generated
+by repeating previous unique write.
+
+\fBworking_set\fR is a more realistic workload.
+With \fBworking_set\fR, \fBdedupe_working_set_percentage\fR should be provided.
+Given that, fio will use the initial unique write buffers as its working set.
+Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+Note that by using \fBworking_set\fR the dedupe percentage will converge
+to the desired over time while \fBrepeat\fR maintains the desired percentage
+throughout the job.
+.RE
+.RE
+.TP
+.BI dedupe_working_set_percentage \fR=\fPint
+If \fBdedupe_mode\fR is set to \fBworking_set\fR, then this controls
+the percentage of size of the file or device used as the buffers
+fio will choose to generate the dedupe buffers from
+.P
+.RS
+Note that \fBsize\fR needs to be explicitly provided and only 1 file
+per job is supported
+.RE
+.TP
.BI invalidate \fR=\fPbool
Invalidate the buffer/page cache parts of the files to be used prior to
starting I/O if the platform and file type support it. Defaults to true.
This will be ignored if \fBpre_read\fR is also specified for the
same job.
.TP
-.BI sync \fR=\fPbool
-Use synchronous I/O for buffered writes. For the majority of I/O engines,
-this means using O_SYNC. Default: false.
+.BI sync \fR=\fPstr
+Whether, and what type, of synchronous I/O to use for writes. The allowed
+values are:
+.RS
+.RS
+.TP
+.B none
+Do not use synchronous IO, the default.
+.TP
+.B 0
+Same as \fBnone\fR.
+.TP
+.B sync
+Use synchronous file IO. For the majority of I/O engines,
+this means using O_SYNC.
+.TP
+.B 1
+Same as \fBsync\fR.
+.TP
+.B dsync
+Use synchronous data IO. For the majority of I/O engines,
+this means using O_DSYNC.
+.PD
+.RE
+.RE
.TP
.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
Fio can use various types of memory as the I/O unit buffer. The allowed
@@ -1537,7 +1654,7 @@ Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
simulate a smaller amount of memory. The amount specified is per worker.
.SS "I/O size"
.TP
-.BI size \fR=\fPint
+.BI size \fR=\fPint[%|z]
The total size of file I/O for each thread of this job. Fio will run until
this many bytes has been transferred, unless runtime is limited by other options
(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR).
@@ -1548,11 +1665,11 @@ set to the physical size of the given files or devices if they exist.
If this option is not specified, fio will use the full size of the given
files or devices. If the files do not exist, size must be given. It is also
possible to give size as a percentage between 1 and 100. If `size=20%' is
-given, fio will use 20% of the full size of the given files or devices.
-Can be combined with \fBoffset\fR to constrain the start and end range
-that I/O will be done within.
+given, fio will use 20% of the full size of the given files or devices. In ZBD mode,
+size can be given in units of number of zones using 'z'. Can be combined with \fBoffset\fR to
+constrain the start and end range that I/O will be done within.
.TP
-.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint
+.BI io_size \fR=\fPint[%|z] "\fR,\fB io_limit" \fR=\fPint[%|z]
Normally fio operates within the region set by \fBsize\fR, which means
that the \fBsize\fR option sets both the region and size of I/O to be
performed. Sometimes that is not what you want. With this option, it is
@@ -1562,7 +1679,8 @@ will perform I/O within the first 20GiB but exit when 5GiB have been
done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB,
and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within
the 0..20GiB region. Value can be set as percentage: \fBio_size\fR=N%.
-In this case \fBio_size\fR multiplies \fBsize\fR= value.
+In this case \fBio_size\fR multiplies \fBsize\fR= value. In ZBD mode, value can
+also be set as number of zones using 'z'.
.TP
.BI filesize \fR=\fPirange(int)
Individual file sizes. May be a range, in which case fio will select sizes
@@ -1579,11 +1697,10 @@ of a file. This option is ignored on non-regular files.
.TP
.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
Sets size to something really large and waits for ENOSPC (no space left on
-device) as the terminating condition. Only makes sense with sequential
+device) or EDQUOT (disk quota exceeded)
+as the terminating condition. Only makes sense with sequential
write. For a read workload, the mount point will be filled first then I/O
-started on the result. This option doesn't make sense if operating on a raw
-device node, since the size of that is already known by the file system.
-Additionally, writing beyond end-of-device will not return ENOSPC there.
+started on the result.
.SS "I/O engine"
.TP
.BI ioengine \fR=\fPstr
@@ -1668,12 +1785,21 @@ This engine defines engine specific options.
.TP
.B cpuio
Doesn't transfer any data, but burns CPU cycles according to the
-\fBcpuload\fR and \fBcpuchunks\fR options. Setting
-\fBcpuload\fR\=85 will cause that job to do nothing but burn 85%
-of the CPU. In case of SMP machines, use `numjobs=<nr_of_cpu>'
-to get desired CPU usage, as the cpuload only loads a
-single CPU at the desired rate. A job never finishes unless there is
-at least one non-cpuio job.
+\fBcpuload\fR, \fBcpuchunks\fR and \fBcpumode\fR options.
+A job never finishes unless there is at least one non-cpuio job.
+.RS
+.P
+.PD 0
+\fBcpuload\fR\=85 will cause that job to do nothing but burn 85% of the CPU.
+In case of SMP machines, use \fBnumjobs=<nr_of_cpu>\fR\ to get desired CPU usage,
+as the cpuload only loads a single CPU at the desired rate.
+
+.P
+\fBcpumode\fR\=qsort replaces the default noop instructions loop
+by a qsort algorithm to consume more energy.
+
+.P
+.RE
.TP
.B rdma
The RDMA I/O engine supports both RDMA memory semantics
@@ -1780,6 +1906,11 @@ Simply do stat() and do no I/O to the file. You need to set 'filesize'
and 'nrfiles', so that files will be created.
This engine is to measure file lookup and meta data access.
.TP
+.B filedelete
+Simply delete files by unlink() and do no I/O to the file. You need to set 'filesize'
+and 'nrfiles', so that files will be created.
+This engine is to measure file delete.
+.TP
.B libpmem
Read and write using mmap I/O to a file on a filesystem
mounted with DAX on a persistent memory device through the PMDK
@@ -1804,6 +1935,26 @@ Read and write iscsi lun with libiscsi.
.TP
.B nbd
Synchronous read and write a Network Block Device (NBD).
+.TP
+.B libcufile
+I/O engine supporting libcufile synchronous access to nvidia-fs and a
+GPUDirect Storage-supported filesystem. This engine performs
+I/O without transferring buffers between user-space and the kernel,
+unless \fBverify\fR is set or \fBcuda_io\fR is \fBposix\fR. \fBiomem\fR must
+not be \fBcudamalloc\fR. This ioengine defines engine specific options.
+.TP
+.B dfs
+I/O engine supporting asynchronous read and write operations to the DAOS File
+System (DFS) via libdfs.
+.TP
+.B nfs
+I/O engine supporting asynchronous read and write operations to
+NFS filesystems from userspace via libnfs. This is useful for
+achieving higher concurrency and thus throughput than is possible
+via kernel NFS.
+.TP
+.B exec
+Execute 3rd party tools. Could be used to perform monitoring during jobs runtime.
.SS "I/O engine specific parameters"
In addition, there are some parameters which are only valid when a specific
\fBioengine\fR is in use. These are used identically to normal parameters,
@@ -1900,7 +2051,7 @@ The TCP or UDP port to bind to or connect to. If this is used with
this will be the starting port number since fio will use a range of
ports.
.TP
-.BI (rdma)port
+.BI (rdma, librpma_*)port
The port to use for RDMA-CM communication. This should be the same
value on the client and the server side.
.TP
@@ -1909,6 +2060,16 @@ The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O.
If the job is a TCP listener or UDP reader, the hostname is not used
and must be omitted unless it is a valid UDP multicast address.
.TP
+.BI (librpma_*)serverip \fR=\fPstr
+The IP address to be used for RDMA-CM based I/O.
+.TP
+.BI (librpma_*_server)direct_write_to_pmem \fR=\fPbool
+Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0.
+.TP
+.BI (librpma_*_server)busy_wait_polling \fR=\fPbool
+Set to 0 to wait for completion instead of busy-wait polling completion.
+Default: 1.
+.TP
.BI (netsplice,net)interface \fR=\fPstr
The IP address of the network interface used to send or receive UDP
multicast.
@@ -2003,6 +2164,11 @@ by default.
Poll store instead of waiting for completion. Usually this provides better
throughput at cost of higher(up to 100%) CPU utilization.
.TP
+.BI (rados)touch_objects \fR=\fPbool
+During initialization, touch (create if do not exist) all objects (files).
+Touching all objects affects ceph caches and likely impacts test results.
+Enabled by default.
+.TP
.BI (http)http_host \fR=\fPstr
Hostname to connect to. For S3, this could be the bucket name. Default
is \fBlocalhost\fR
@@ -2068,6 +2234,16 @@ client and the server or in certain loopback configurations.
Specify stat system call type to measure lookup/getattr performance.
Default is \fBstat\fR for \fBstat\fR\|(2).
.TP
+.BI (sg)hipri
+If this option is set, fio will attempt to use polled IO completions. This
+will have a similar effect as (io_uring)hipri. Only SCSI READ and WRITE
+commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor VERIFY).
+Older versions of the Linux sg driver that do not support hipri will simply
+ignore this flag and do normal IO. The Linux SCSI Low Level Driver (LLD)
+that "owns" the device also needs to support hipri (also known as iopoll
+and mq_poll). The MegaRAID driver is an example of a SCSI LLD.
+Default: clear (0) which does normal (interrupted based) IO.
+.TP
.BI (sg)readfua \fR=\fPbool
With readfua option set to 1, read operations include the force
unit access (fua) flag. Default: 0.
@@ -2117,7 +2293,79 @@ Example URIs:
\fInbd+unix:///?socket=/tmp/socket\fR
.TP
\fInbds://tlshost/exportname\fR
-
+.RE
+.RE
+.TP
+.BI (libcufile)gpu_dev_ids\fR=\fPstr
+Specify the GPU IDs to use with CUDA. This is a colon-separated list of int.
+GPUs are assigned to workers roundrobin. Default is 0.
+.TP
+.BI (libcufile)cuda_io\fR=\fPstr
+Specify the type of I/O to use with CUDA. This option
+takes the following values:
+.RS
+.RS
+.TP
+.B cufile (default)
+Use libcufile and nvidia-fs. This option performs I/O directly
+between a GPUDirect Storage filesystem and GPU buffers,
+avoiding use of a bounce buffer. If \fBverify\fR is set,
+cudaMemcpy is used to copy verification data between RAM and GPU(s).
+Verification data is copied from RAM to GPU before a write
+and from GPU to RAM after a read.
+\fBdirect\fR must be 1.
+.TP
+.BI posix
+Use POSIX to perform I/O with a RAM buffer, and use
+cudaMemcpy to transfer data between RAM and the GPU(s).
+Data is copied from GPU to RAM before a write and copied
+from RAM to GPU after a read. \fBverify\fR does not affect
+the use of cudaMemcpy.
+.RE
+.RE
+.TP
+.BI (dfs)pool
+Specify the UUID of the DAOS pool to connect to.
+.TP
+.BI (dfs)cont
+Specify the UUID of the DAOS container to open.
+.TP
+.BI (dfs)chunk_size
+Specify a different chunk size (in bytes) for the dfs file.
+Use DAOS container's chunk size by default.
+.TP
+.BI (dfs)object_class
+Specify a different object class for the dfs file.
+Use DAOS container's object class by default.
+.TP
+.BI (nfs)nfs_url
+URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
+Refer to the libnfs README for more details.
+.TP
+.BI (exec)program\fR=\fPstr
+Specify the program to execute.
+Note the program will receive a SIGTERM when the job is reaching the time limit.
+A SIGKILL is sent once the job is over. The delay between the two signals is defined by \fBgrace_time\fR option.
+.TP
+.BI (exec)arguments\fR=\fPstr
+Specify arguments to pass to program.
+Some special variables can be expanded to pass fio's job details to the program :
+.RS
+.RS
+.TP
+.B %r
+replaced by the duration of the job in seconds
+.TP
+.BI %n
+replaced by the name of the job
+.RE
+.RE
+.TP
+.BI (exec)grace_time\fR=\fPint
+Defines the time between the SIGTERM and SIGKILL signals. Default is 1 second.
+.TP
+.BI (exec)std_redirect\fR=\fPbool
+If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
.SS "I/O depth"
.TP
.BI iodepth \fR=\fPint
@@ -2238,6 +2486,12 @@ queue depth setting redundant, since no more than 1 I/O will be queued
before we have to complete it and do our \fBthinktime\fR. In other words, this
setting effectively caps the queue depth if the latter is larger.
.TP
+.BI thinktime_blocks_type \fR=\fPstr
+Only valid if \fBthinktime\fR is set - control how \fBthinktime_blocks\fR triggers.
+The default is `complete', which triggers \fBthinktime\fR when fio completes
+\fBthinktime_blocks\fR blocks. If this is set to `issue', then the trigger happens
+at the issue side.
+.TP
.BI rate \fR=\fPint[,int][,int]
Cap the bandwidth used by this job. The number is in bytes/sec, the normal
suffix rules apply. Comma-separated values may be specified for reads,
@@ -2306,10 +2560,11 @@ Used with \fBlatency_target\fR. If false (default), fio will find the highest
queue depth that meets \fBlatency_target\fR and exit. If true, fio will continue
running and try to meet \fBlatency_target\fR by adjusting queue depth.
.TP
-.BI max_latency \fR=\fPtime
+.BI max_latency \fR=\fPtime[,time][,time]
If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
maximum latency. When the unit is omitted, the value is interpreted in
-microseconds.
+microseconds. Comma-separated values may be specified for reads, writes,
+and trims as described in \fBblocksize\fR.
.TP
.BI rate_cycle \fR=\fPint
Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number
@@ -3620,7 +3875,7 @@ Below is a single line containing short names for each of the fields in the
minimal output v3, separated by semicolons:
.P
.nf
- terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+ terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth_kb;read_iops;read_runtime_ms;read_slat_min_us;read_slat_max_us;read_slat_mean_us;read_slat_dev_us;read_clat_min_us;read_clat_max_us;read_clat_mean_us;read_clat_dev_us;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min_us;read_lat_max_us;read_lat_mean_us;read_lat_dev_us;read_bw_min_kb;read_bw_max_kb;read_bw_agg_pct;read_bw_mean_kb;read_bw_dev_kb;write_kb;write_bandwidth_kb;write_iops;write_runtime_ms;write_slat_min_us;write_slat_max_us;write_slat_mean_us;write_slat_dev_us;write_clat_min_us;write_clat_max_us;write_clat_mean_us;write_clat_dev_us;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min_us;write_lat_max_us;write_lat_mean_us;write_lat_dev_us;write_bw_min_kb;write_bw_max_kb;write_bw_agg_pct;write_bw_mean_kb;write_bw_dev_kb;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
.fi
.P
In client/server mode terse output differs from what appears when jobs are run
diff --git a/fio.h b/fio.h
index 9d189eb8..6f6b211b 100644
--- a/fio.h
+++ b/fio.h
@@ -47,6 +47,7 @@
#include "workqueue.h"
#include "steadystate.h"
#include "lib/nowarn_snprintf.h"
+#include "dedupe.h"
#ifdef CONFIG_SOLARISAIO
#include <sys/asynch.h>
@@ -140,6 +141,7 @@ enum {
FIO_RAND_POISSON2_OFF,
FIO_RAND_POISSON3_OFF,
FIO_RAND_PRIO_CMDS,
+ FIO_RAND_DEDUPE_WORKING_SET_IX,
FIO_RAND_NR_OFFS,
};
@@ -149,6 +151,9 @@ enum {
RATE_PROCESS_LINEAR = 0,
RATE_PROCESS_POISSON = 1,
+
+ THINKTIME_BLOCKS_TYPE_COMPLETE = 0,
+ THINKTIME_BLOCKS_TYPE_ISSUE = 1,
};
enum {
@@ -229,6 +234,7 @@ struct thread_data {
double pareto_h;
double gauss_dev;
};
+ double random_center;
int error;
int sig;
int done;
@@ -255,9 +261,14 @@ struct thread_data {
struct frand_state buf_state;
struct frand_state buf_state_prev;
+ struct frand_state buf_state_ret;
struct frand_state dedupe_state;
struct frand_state zone_state;
struct frand_state prio_state;
+ struct frand_state dedupe_working_set_index_state;
+ struct frand_state *dedupe_working_set_states;
+
+ unsigned long long num_unique_pages;
struct zone_split_index **zone_state_index;
unsigned int num_open_zones;
@@ -280,7 +291,6 @@ struct thread_data {
* IO engine private data and dlhandle.
*/
void *io_ops_data;
- void *io_ops_dlhandle;
/*
* Queue depth of io_u's that fio MIGHT do
@@ -354,6 +364,8 @@ struct thread_data {
struct fio_sem *sem;
uint64_t bytes_done[DDIR_RWDIR_CNT];
+ uint64_t *thinktime_blocks_counter;
+
/*
* State for random io, a bitmap of blocks done vs not done
*/
@@ -408,6 +420,7 @@ struct thread_data {
*/
struct flist_head io_log_list;
FILE *io_log_rfile;
+ unsigned int io_log_blktrace;
unsigned int io_log_current;
unsigned int io_log_checkmark;
unsigned int io_log_highmark;
@@ -467,6 +480,12 @@ struct thread_data {
};
+struct thread_segment {
+ struct thread_data *threads;
+ int shm_id;
+ int nr_threads;
+};
+
/*
* when should interactive ETA output be generated
*/
@@ -510,10 +529,15 @@ enum {
#define __fio_stringify_1(x) #x
#define __fio_stringify(x) __fio_stringify_1(x)
+#define REAL_MAX_JOBS 4096
+#define JOBS_PER_SEG 8
+#define REAL_MAX_SEG (REAL_MAX_JOBS / JOBS_PER_SEG)
+
extern bool exitall_on_terminate;
extern unsigned int thread_number;
extern unsigned int stat_number;
-extern int shm_id;
+extern unsigned int nr_segments;
+extern unsigned int cur_segment;
extern int groupid;
extern int output_format;
extern int append_terse_output;
@@ -542,7 +566,15 @@ extern char *trigger_remote_cmd;
extern long long trigger_timeout;
extern char *aux_path;
-extern struct thread_data *threads;
+extern struct thread_segment segments[REAL_MAX_SEG];
+
+static inline struct thread_data *tnumber_to_td(unsigned int tnumber)
+{
+ struct thread_segment *seg;
+
+ seg = &segments[tnumber / JOBS_PER_SEG];
+ return &seg->threads[tnumber & (JOBS_PER_SEG - 1)];
+}
static inline bool is_running_backend(void)
{
@@ -557,8 +589,6 @@ static inline void fio_ro_check(const struct thread_data *td, struct io_u *io_u)
!(io_u->ddir == DDIR_TRIM && !td_trim(td)));
}
-#define REAL_MAX_JOBS 4096
-
static inline bool should_fsync(struct thread_data *td)
{
if (td->last_was_sync)
@@ -709,7 +739,7 @@ extern void lat_target_reset(struct thread_data *);
* Iterates all threads/processes within all the defined jobs
*/
#define for_each_td(td, i) \
- for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++)
+ for ((i) = 0, (td) = &segments[0].threads[0]; (i) < (int) thread_number; (i)++, (td) = tnumber_to_td((i)))
#define for_each_file(td, f, i) \
if ((td)->files_index) \
for ((i) = 0, (f) = (td)->files[0]; \
@@ -739,17 +769,9 @@ static inline bool option_check_rate(struct thread_data *td, enum fio_ddir ddir)
return false;
}
-static inline bool __should_check_rate(struct thread_data *td)
-{
- return (td->flags & TD_F_CHECK_RATE) != 0;
-}
-
static inline bool should_check_rate(struct thread_data *td)
{
- if (!__should_check_rate(td))
- return false;
-
- return ddir_rw_sum(td->bytes_done) != 0;
+ return (td->flags & TD_F_CHECK_RATE) != 0;
}
static inline unsigned long long td_max_bs(struct thread_data *td)
diff --git a/fio_time.h b/fio_time.h
index c00f8e78..b3bbd4c0 100644
--- a/fio_time.h
+++ b/fio_time.h
@@ -13,6 +13,8 @@ extern uint64_t ntime_since(const struct timespec *, const struct timespec *);
extern uint64_t ntime_since_now(const struct timespec *);
extern uint64_t utime_since(const struct timespec *, const struct timespec *);
extern uint64_t utime_since_now(const struct timespec *);
+extern int64_t rel_time_since(const struct timespec *,
+ const struct timespec *);
extern uint64_t mtime_since(const struct timespec *, const struct timespec *);
extern uint64_t mtime_since_now(const struct timespec *);
extern uint64_t mtime_since_tv(const struct timeval *, const struct timeval *);
diff --git a/flow.c b/flow.c
index ee4d761d..c64bb3b2 100644
--- a/flow.c
+++ b/flow.c
@@ -5,9 +5,9 @@
struct fio_flow {
unsigned int refs;
- struct flist_head list;
unsigned int id;
- unsigned long long flow_counter;
+ struct flist_head list;
+ unsigned long flow_counter;
unsigned int total_weight;
};
@@ -37,6 +37,8 @@ int flow_threshold_exceeded(struct thread_data *td)
if (td->o.flow_sleep) {
io_u_quiesce(td);
usleep(td->o.flow_sleep);
+ } else if (td->o.zone_mode == ZONE_MODE_ZBD) {
+ io_u_quiesce(td);
}
return 1;
@@ -90,7 +92,7 @@ static struct fio_flow *flow_get(unsigned int id)
return flow;
}
-static void flow_put(struct fio_flow *flow, unsigned long long flow_counter,
+static void flow_put(struct fio_flow *flow, unsigned long flow_counter,
unsigned int weight)
{
if (!flow_lock)
diff --git a/gclient.c b/gclient.c
index fe83382f..e0e0e7bf 100644
--- a/gclient.c
+++ b/gclient.c
@@ -48,7 +48,7 @@ static GtkActionEntry results_menu_items[] = {
{ "PrintFile", GTK_STOCK_PRINT, "Print", "<Control>P", NULL, G_CALLBACK(results_print) },
{ "CloseFile", GTK_STOCK_CLOSE, "Close", "<Control>W", NULL, G_CALLBACK(results_close) },
};
-static gint results_nmenu_items = ARRAY_SIZE(results_menu_items);
+static gint results_nmenu_items = FIO_ARRAY_SIZE(results_menu_items);
static const gchar *results_ui_string = " \
<ui> \
@@ -755,7 +755,7 @@ static void gfio_show_io_depths(GtkWidget *vbox, struct thread_stat *ts)
GtkListStore *model;
int i;
const char *labels[] = { "Depth", "0", "1", "2", "4", "8", "16", "32", "64", ">= 64" };
- const int nr_labels = ARRAY_SIZE(labels);
+ const int nr_labels = FIO_ARRAY_SIZE(labels);
GType types[nr_labels];
frame = gtk_frame_new("IO depths");
diff --git a/gettime-thread.c b/gettime-thread.c
index 953e4e67..86c2e2ef 100644
--- a/gettime-thread.c
+++ b/gettime-thread.c
@@ -58,7 +58,7 @@ static void *gtod_thread_main(void *data)
* but I'm not sure what to use outside of a simple CPU nop to relax
* it - we don't want to lose precision.
*/
- while (threads) {
+ while (nr_segments) {
fio_gtod_update();
nop;
}
diff --git a/gettime.c b/gettime.c
index c3a4966b..099e9d9f 100644
--- a/gettime.c
+++ b/gettime.c
@@ -127,18 +127,33 @@ static void fio_init gtod_init(void)
#endif /* FIO_DEBUG_TIME */
-#ifdef CONFIG_CLOCK_GETTIME
-static int fill_clock_gettime(struct timespec *ts)
+/*
+ * Queries the value of the monotonic clock if a monotonic clock is available
+ * or the wall clock time if no monotonic clock is available. Returns 0 if
+ * querying the clock succeeded or -1 if querying the clock failed.
+ */
+int fio_get_mono_time(struct timespec *ts)
{
-#if defined(CONFIG_CLOCK_MONOTONIC_RAW)
- return clock_gettime(CLOCK_MONOTONIC_RAW, ts);
-#elif defined(CONFIG_CLOCK_MONOTONIC)
- return clock_gettime(CLOCK_MONOTONIC, ts);
+ int ret;
+
+#ifdef CONFIG_CLOCK_GETTIME
+#if defined(CONFIG_CLOCK_MONOTONIC)
+ ret = clock_gettime(CLOCK_MONOTONIC, ts);
#else
- return clock_gettime(CLOCK_REALTIME, ts);
+ ret = clock_gettime(CLOCK_REALTIME, ts);
#endif
-}
+#else
+ struct timeval tv;
+
+ ret = gettimeofday(&tv, NULL);
+ if (ret == 0) {
+ ts->tv_sec = tv.tv_sec;
+ ts->tv_nsec = tv.tv_usec * 1000;
+ }
#endif
+ assert(ret <= 0);
+ return ret;
+}
static void __fio_gettime(struct timespec *tp)
{
@@ -155,8 +170,8 @@ static void __fio_gettime(struct timespec *tp)
#endif
#ifdef CONFIG_CLOCK_GETTIME
case CS_CGETTIME: {
- if (fill_clock_gettime(tp) < 0) {
- log_err("fio: clock_gettime fails\n");
+ if (fio_get_mono_time(tp) < 0) {
+ log_err("fio: fio_get_mono_time() fails\n");
assert(0);
}
break;
@@ -224,19 +239,13 @@ static unsigned long get_cycles_per_msec(void)
{
struct timespec s, e;
uint64_t c_s, c_e;
- enum fio_cs old_cs = fio_clock_source;
uint64_t elapsed;
-#ifdef CONFIG_CLOCK_GETTIME
- fio_clock_source = CS_CGETTIME;
-#else
- fio_clock_source = CS_GTOD;
-#endif
- __fio_gettime(&s);
+ fio_get_mono_time(&s);
c_s = get_cpu_clock();
do {
- __fio_gettime(&e);
+ fio_get_mono_time(&e);
c_e = get_cpu_clock();
elapsed = ntime_since(&s, &e);
@@ -244,7 +253,6 @@ static unsigned long get_cycles_per_msec(void)
break;
} while (1);
- fio_clock_source = old_cs;
return (c_e - c_s) * 1000000 / elapsed;
}
@@ -516,23 +524,33 @@ uint64_t mtime_since_now(const struct timespec *s)
return mtime_since(s, &t);
}
-uint64_t mtime_since(const struct timespec *s, const struct timespec *e)
+/*
+ * Returns *e - *s in milliseconds as a signed integer. Note: rounding is
+ * asymmetric. If the difference yields +1 ns then 0 is returned. If the
+ * difference yields -1 ns then -1 is returned.
+ */
+int64_t rel_time_since(const struct timespec *s, const struct timespec *e)
{
- int64_t sec, usec;
+ int64_t sec, nsec;
sec = e->tv_sec - s->tv_sec;
- usec = (e->tv_nsec - s->tv_nsec) / 1000;
- if (sec > 0 && usec < 0) {
+ nsec = e->tv_nsec - s->tv_nsec;
+ if (nsec < 0) {
sec--;
- usec += 1000000;
+ nsec += 1000ULL * 1000 * 1000;
}
+ assert(0 <= nsec && nsec < 1000ULL * 1000 * 1000);
- if (sec < 0 || (sec == 0 && usec < 0))
- return 0;
+ return sec * 1000 + nsec / (1000 * 1000);
+}
- sec *= 1000;
- usec /= 1000;
- return sec + usec;
+/*
+ * Returns *e - *s in milliseconds as an unsigned integer. Returns 0 if
+ * *e < *s.
+ */
+uint64_t mtime_since(const struct timespec *s, const struct timespec *e)
+{
+ return max(rel_time_since(s, e), (int64_t)0);
}
uint64_t time_since_now(const struct timespec *s)
@@ -653,12 +671,21 @@ static int clock_cmp(const void *p1, const void *p2)
int fio_monotonic_clocktest(int debug)
{
struct clock_thread *cthreads;
- unsigned int nr_cpus = cpus_online();
+ unsigned int seen_cpus, nr_cpus = cpus_online();
struct clock_entry *entries;
unsigned long nr_entries, tentries, failed = 0;
struct clock_entry *prev, *this;
uint32_t seq = 0;
unsigned int i;
+ os_cpu_mask_t mask;
+
+#ifdef FIO_HAVE_GET_THREAD_AFFINITY
+ fio_get_thread_affinity(mask);
+#else
+ memset(&mask, 0, sizeof(mask));
+ for (i = 0; i < nr_cpus; i++)
+ fio_cpu_set(&mask, i);
+#endif
if (debug) {
log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no");
@@ -685,25 +712,31 @@ int fio_monotonic_clocktest(int debug)
if (debug)
log_info("cs: Testing %u CPUs\n", nr_cpus);
+ seen_cpus = 0;
for (i = 0; i < nr_cpus; i++) {
struct clock_thread *t = &cthreads[i];
+ if (!fio_cpu_isset(&mask, i))
+ continue;
t->cpu = i;
t->debug = debug;
t->seq = &seq;
t->nr_entries = nr_entries;
- t->entries = &entries[i * nr_entries];
+ t->entries = &entries[seen_cpus * nr_entries];
__fio_sem_init(&t->lock, FIO_SEM_LOCKED);
if (pthread_create(&t->thread, NULL, clock_thread_fn, t)) {
failed++;
nr_cpus = i;
break;
}
+ seen_cpus++;
}
for (i = 0; i < nr_cpus; i++) {
struct clock_thread *t = &cthreads[i];
+ if (!fio_cpu_isset(&mask, i))
+ continue;
fio_sem_up(&t->lock);
}
@@ -711,6 +744,8 @@ int fio_monotonic_clocktest(int debug)
struct clock_thread *t = &cthreads[i];
void *ret;
+ if (!fio_cpu_isset(&mask, i))
+ continue;
pthread_join(t->thread, &ret);
if (ret)
failed++;
@@ -724,6 +759,7 @@ int fio_monotonic_clocktest(int debug)
goto err;
}
+ tentries = nr_entries * seen_cpus;
qsort(entries, tentries, sizeof(struct clock_entry), clock_cmp);
/* silence silly gcc */
diff --git a/gettime.h b/gettime.h
index c55f5cba..f1d619ad 100644
--- a/gettime.h
+++ b/gettime.h
@@ -16,6 +16,7 @@ enum fio_cs {
CS_INVAL,
};
+extern int fio_get_mono_time(struct timespec *);
extern void fio_gettime(struct timespec *, void *);
extern void fio_gtod_init(void);
extern void fio_clock_init(void);
diff --git a/gfio.c b/gfio.c
index 734651b6..22c5314d 100644
--- a/gfio.c
+++ b/gfio.c
@@ -1274,7 +1274,7 @@ static GtkActionEntry menu_items[] = {
{ "Quit", GTK_STOCK_QUIT, NULL, "<Control>Q", NULL, G_CALLBACK(quit_clicked) },
{ "About", GTK_STOCK_ABOUT, NULL, NULL, NULL, G_CALLBACK(about_dialog) },
};
-static gint nmenu_items = ARRAY_SIZE(menu_items);
+static gint nmenu_items = FIO_ARRAY_SIZE(menu_items);
static const gchar *ui_string = " \
<ui> \
@@ -1447,7 +1447,7 @@ static GtkWidget *new_client_page(struct gui_entry *ge)
gtk_container_add(GTK_CONTAINER(bottom_align), ge->buttonbox);
gtk_box_pack_start(GTK_BOX(main_vbox), bottom_align, FALSE, FALSE, 0);
- add_buttons(ge, buttonspeclist, ARRAY_SIZE(buttonspeclist));
+ add_buttons(ge, buttonspeclist, FIO_ARRAY_SIZE(buttonspeclist));
/*
* Set up thread status progress bar
diff --git a/goptions.c b/goptions.c
index f44254bf..0b8c56a2 100644
--- a/goptions.c
+++ b/goptions.c
@@ -826,7 +826,7 @@ static struct gopt *gopt_new_str_val(struct gopt_job_view *gjv,
unsigned long long *p, unsigned int idx)
{
struct gopt_str_val *g;
- const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "PiB", "PiB", "" };
+ const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB" };
GtkWidget *label;
int i;
diff --git a/helper_thread.c b/helper_thread.c
index a2fb7c29..d8e7ebfe 100644
--- a/helper_thread.c
+++ b/helper_thread.c
@@ -1,4 +1,8 @@
#include <signal.h>
+#include <unistd.h>
+#ifdef CONFIG_HAVE_TIMERFD_CREATE
+#include <sys/timerfd.h>
+#endif
#ifdef CONFIG_VALGRIND_DEV
#include <valgrind/drd.h>
#else
@@ -11,6 +15,9 @@
#include "steadystate.h"
#include "pshared.h"
+static int sleep_accuracy_ms;
+static int timerfd = -1;
+
enum action {
A_EXIT = 1,
A_RESET = 2,
@@ -25,6 +32,13 @@ static struct helper_data {
struct fio_sem *startup_sem;
} *helper_data;
+struct interval_timer {
+ const char *name;
+ struct timespec expires;
+ uint32_t interval_ms;
+ int (*func)(void);
+};
+
void helper_thread_destroy(void)
{
if (!helper_data)
@@ -83,6 +97,18 @@ static int read_from_pipe(int fd, void *buf, size_t len)
}
#endif
+static void block_signals(void)
+{
+#ifdef HAVE_PTHREAD_SIGMASK
+	sigset_t sigmask;
+	int ret;
+
+ ret = pthread_sigmask(SIG_UNBLOCK, NULL, &sigmask);
+ assert(ret == 0);
+ ret = pthread_sigmask(SIG_BLOCK, &sigmask, NULL);
+ assert(ret == 0);
+#endif
+}
+
static void submit_action(enum action a)
{
const char data = a;
@@ -128,128 +154,207 @@ void helper_thread_exit(void)
pthread_join(helper_data->thread, NULL);
}
-static unsigned int task_helper(struct timespec *last, struct timespec *now, unsigned int period, void do_task())
+/* Resets timers and returns the time in milliseconds until the next event. */
+static int reset_timers(struct interval_timer timer[], int num_timers,
+ struct timespec *now)
{
- unsigned int next, since;
-
- since = mtime_since(last, now);
- if (since >= period || period - since < 10) {
- do_task();
- timespec_add_msec(last, since);
- if (since > period)
- next = period - (since - period);
- else
- next = period;
- } else
- next = period - since;
-
- return next;
+ uint32_t msec_to_next_event = INT_MAX;
+ int i;
+
+ for (i = 0; i < num_timers; ++i) {
+ timer[i].expires = *now;
+ timespec_add_msec(&timer[i].expires, timer[i].interval_ms);
+ msec_to_next_event = min_not_zero(msec_to_next_event,
+ timer[i].interval_ms);
+ }
+
+ return msec_to_next_event;
}
-static void *helper_thread_main(void *data)
+/*
+ * Waits for an action from fd during at least timeout_ms. `fd` must be in
+ * non-blocking mode.
+ */
+static uint8_t wait_for_action(int fd, unsigned int timeout_ms)
{
- struct helper_data *hd = data;
- unsigned int msec_to_next_event, next_log, next_si = status_interval;
- unsigned int next_ss = STEADYSTATE_MSEC;
- struct timespec ts, last_du, last_ss, last_si;
- char action;
- int ret = 0;
-
- sk_out_assign(hd->sk_out);
+ struct timeval timeout = {
+ .tv_sec = timeout_ms / 1000,
+ .tv_usec = (timeout_ms % 1000) * 1000,
+ };
+ fd_set rfds, efds;
+ uint8_t action = 0;
+ uint64_t exp;
+ int res;
-#ifdef HAVE_PTHREAD_SIGMASK
+ res = read_from_pipe(fd, &action, sizeof(action));
+ if (res > 0 || timeout_ms == 0)
+ return action;
+ FD_ZERO(&rfds);
+ FD_SET(fd, &rfds);
+ FD_ZERO(&efds);
+ FD_SET(fd, &efds);
+#ifdef CONFIG_HAVE_TIMERFD_CREATE
{
- sigset_t sigmask;
-
- /* Let another thread handle signals. */
- ret = pthread_sigmask(SIG_UNBLOCK, NULL, &sigmask);
- assert(ret == 0);
- ret = pthread_sigmask(SIG_BLOCK, &sigmask, NULL);
- assert(ret == 0);
+ /*
+ * If the timer frequency is 100 Hz, select() will round up
+ * `timeout` to the next multiple of 1 / 100 Hz = 10 ms. Hence
+ * use a high-resolution timer if possible to increase
+ * select() timeout accuracy.
+ */
+ struct itimerspec delta = {};
+
+ delta.it_value.tv_sec = timeout.tv_sec;
+ delta.it_value.tv_nsec = timeout.tv_usec * 1000;
+ res = timerfd_settime(timerfd, 0, &delta, NULL);
+ assert(res == 0);
+ FD_SET(timerfd, &rfds);
}
#endif
+ res = select(max(fd, timerfd) + 1, &rfds, NULL, &efds,
+ timerfd >= 0 ? NULL : &timeout);
+ if (res < 0) {
+ log_err("fio: select() call in helper thread failed: %s",
+ strerror(errno));
+ return A_EXIT;
+ }
+ if (FD_ISSET(fd, &rfds))
+ read_from_pipe(fd, &action, sizeof(action));
+ if (timerfd >= 0 && FD_ISSET(timerfd, &rfds)) {
+ res = read(timerfd, &exp, sizeof(exp));
+ assert(res == sizeof(exp));
+ }
+ return action;
+}
+
+/*
+ * Verify whether or not timer @it has expired. If timer @it has expired, call
+ * @it->func(). @now is the current time. @msec_to_next_event is an
+ * input/output parameter that represents the time until the next event.
+ */
+static int eval_timer(struct interval_timer *it, const struct timespec *now,
+ unsigned int *msec_to_next_event)
+{
+ int64_t delta_ms;
+ bool expired;
+
+ /* interval == 0 means that the timer is disabled. */
+ if (it->interval_ms == 0)
+ return 0;
+
+ delta_ms = rel_time_since(now, &it->expires);
+ expired = delta_ms <= sleep_accuracy_ms;
+ if (expired) {
+ timespec_add_msec(&it->expires, it->interval_ms);
+ delta_ms = rel_time_since(now, &it->expires);
+ if (delta_ms < it->interval_ms - sleep_accuracy_ms ||
+ delta_ms > it->interval_ms + sleep_accuracy_ms) {
+ dprint(FD_HELPERTHREAD,
+ "%s: delta = %" PRIi64 " <> %u. Clock jump?\n",
+ it->name, delta_ms, it->interval_ms);
+ delta_ms = it->interval_ms;
+ it->expires = *now;
+ timespec_add_msec(&it->expires, it->interval_ms);
+ }
+ }
+ *msec_to_next_event = min((unsigned int)delta_ms, *msec_to_next_event);
+ return expired ? it->func() : 0;
+}
+
+static void *helper_thread_main(void *data)
+{
+ struct helper_data *hd = data;
+ unsigned int msec_to_next_event, next_log;
+ struct interval_timer timer[] = {
+ {
+ .name = "disk_util",
+ .interval_ms = DISK_UTIL_MSEC,
+ .func = update_io_ticks,
+ },
+ {
+ .name = "status_interval",
+ .interval_ms = status_interval,
+ .func = __show_running_run_stats,
+ },
+ {
+ .name = "steadystate",
+ .interval_ms = steadystate_enabled ? STEADYSTATE_MSEC :
+ 0,
+ .func = steadystate_check,
+ }
+ };
+ struct timespec ts;
+ int clk_tck, ret = 0;
-#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
- clock_gettime(CLOCK_MONOTONIC, &ts);
+#ifdef _SC_CLK_TCK
+ clk_tck = sysconf(_SC_CLK_TCK);
#else
- clock_gettime(CLOCK_REALTIME, &ts);
+ /*
+	 * The timer frequency is variable on Windows. Instead of trying to
+ * query it, use 64 Hz, the clock frequency lower bound. See also
+ * https://carpediemsystems.co.uk/2019/07/18/windows-system-timer-granularity/.
+ */
+ clk_tck = 64;
+#endif
+ dprint(FD_HELPERTHREAD, "clk_tck = %d\n", clk_tck);
+ assert(clk_tck > 0);
+ sleep_accuracy_ms = (1000 + clk_tck - 1) / clk_tck;
+
+#ifdef CONFIG_HAVE_TIMERFD_CREATE
+ timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+ assert(timerfd >= 0);
+ sleep_accuracy_ms = 1;
#endif
- memcpy(&last_du, &ts, sizeof(ts));
- memcpy(&last_ss, &ts, sizeof(ts));
- memcpy(&last_si, &ts, sizeof(ts));
+
+ sk_out_assign(hd->sk_out);
+
+ /* Let another thread handle signals. */
+ block_signals();
+
+ fio_get_mono_time(&ts);
+ msec_to_next_event = reset_timers(timer, FIO_ARRAY_SIZE(timer), &ts);
fio_sem_up(hd->startup_sem);
- msec_to_next_event = DISK_UTIL_MSEC;
while (!ret && !hd->exit) {
- uint64_t since_du;
- struct timeval timeout = {
- .tv_sec = msec_to_next_event / 1000,
- .tv_usec = (msec_to_next_event % 1000) * 1000,
- };
- fd_set rfds, efds;
-
- if (read_from_pipe(hd->pipe[0], &action, sizeof(action)) < 0) {
- FD_ZERO(&rfds);
- FD_SET(hd->pipe[0], &rfds);
- FD_ZERO(&efds);
- FD_SET(hd->pipe[0], &efds);
- if (select(1, &rfds, NULL, &efds, &timeout) < 0) {
- log_err("fio: select() call in helper thread failed: %s",
- strerror(errno));
- ret = 1;
- }
- if (read_from_pipe(hd->pipe[0], &action, sizeof(action)) <
- 0)
- action = 0;
- }
+ uint8_t action;
+ int i;
-#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
- clock_gettime(CLOCK_MONOTONIC, &ts);
-#else
- clock_gettime(CLOCK_REALTIME, &ts);
-#endif
+ action = wait_for_action(hd->pipe[0], msec_to_next_event);
+ if (action == A_EXIT)
+ break;
- if (action == A_RESET) {
- last_du = ts;
- last_ss = ts;
- }
+ fio_get_mono_time(&ts);
+
+ msec_to_next_event = INT_MAX;
- since_du = mtime_since(&last_du, &ts);
- if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) {
- ret = update_io_ticks();
- timespec_add_msec(&last_du, DISK_UTIL_MSEC);
- msec_to_next_event = DISK_UTIL_MSEC;
- if (since_du >= DISK_UTIL_MSEC)
- msec_to_next_event -= (since_du - DISK_UTIL_MSEC);
- } else
- msec_to_next_event = DISK_UTIL_MSEC - since_du;
+ if (action == A_RESET)
+ msec_to_next_event = reset_timers(timer,
+ FIO_ARRAY_SIZE(timer), &ts);
+
+ for (i = 0; i < FIO_ARRAY_SIZE(timer); ++i)
+ ret = eval_timer(&timer[i], &ts, &msec_to_next_event);
if (action == A_DO_STAT)
__show_running_run_stats();
- if (status_interval) {
- next_si = task_helper(&last_si, &ts, status_interval, __show_running_run_stats);
- msec_to_next_event = min(next_si, msec_to_next_event);
- }
-
next_log = calc_log_samples();
if (!next_log)
next_log = DISK_UTIL_MSEC;
- if (steadystate_enabled) {
- next_ss = task_helper(&last_ss, &ts, STEADYSTATE_MSEC, steadystate_check);
- msec_to_next_event = min(next_ss, msec_to_next_event);
- }
-
msec_to_next_event = min(next_log, msec_to_next_event);
- dprint(FD_HELPERTHREAD, "next_si: %u, next_ss: %u, next_log: %u, msec_to_next_event: %u\n",
- next_si, next_ss, next_log, msec_to_next_event);
+ dprint(FD_HELPERTHREAD,
+ "next_log: %u, msec_to_next_event: %u\n",
+ next_log, msec_to_next_event);
if (!is_backend)
print_thread_status();
}
+ if (timerfd >= 0) {
+ close(timerfd);
+ timerfd = -1;
+ }
+
fio_writeout_logs(false);
sk_out_drop();
diff --git a/init.c b/init.c
index 7f64ce21..871fb5ad 100644
--- a/init.c
+++ b/init.c
@@ -45,13 +45,12 @@ const char fio_version_string[] = FIO_VERSION;
#define FIO_RANDSEED (0xb1899bedUL)
static char **ini_file;
-static int max_jobs = FIO_MAX_JOBS;
static bool dump_cmdline;
static bool parse_only;
static bool merge_blktrace_only;
static struct thread_data def_thread;
-struct thread_data *threads = NULL;
+struct thread_segment segments[REAL_MAX_SEG];
static char **job_sections;
static int nr_job_sections;
@@ -301,25 +300,35 @@ static struct option l_opts[FIO_NR_OPTIONS] = {
void free_threads_shm(void)
{
- if (threads) {
- void *tp = threads;
+ int i;
+
+ for (i = 0; i < nr_segments; i++) {
+ struct thread_segment *seg = &segments[i];
+
+ if (seg->threads) {
+ void *tp = seg->threads;
#ifndef CONFIG_NO_SHM
- struct shmid_ds sbuf;
+ struct shmid_ds sbuf;
- threads = NULL;
- shmdt(tp);
- shmctl(shm_id, IPC_RMID, &sbuf);
- shm_id = -1;
+ seg->threads = NULL;
+ shmdt(tp);
+ shmctl(seg->shm_id, IPC_RMID, &sbuf);
+ seg->shm_id = -1;
#else
- threads = NULL;
- free(tp);
+ seg->threads = NULL;
+ free(tp);
#endif
+ }
}
+
+ nr_segments = 0;
+ cur_segment = 0;
}
static void free_shm(void)
{
- if (threads) {
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ if (nr_segments) {
flow_exit();
fio_debug_jobp = NULL;
fio_warned = NULL;
@@ -335,73 +344,82 @@ static void free_shm(void)
fio_filelock_exit();
file_hash_exit();
scleanup();
+#endif
}
-/*
- * The thread area is shared between the main process and the job
- * threads/processes. So setup a shared memory segment that will hold
- * all the job info. We use the end of the region for keeping track of
- * open files across jobs, for file sharing.
- */
-static int setup_thread_area(void)
+static int add_thread_segment(void)
{
+ struct thread_segment *seg = &segments[nr_segments];
+ size_t size = JOBS_PER_SEG * sizeof(struct thread_data);
int i;
- if (threads)
- return 0;
-
- /*
- * 1024 is too much on some machines, scale max_jobs if
- * we get a failure that looks like too large a shm segment
- */
- do {
- size_t size = max_jobs * sizeof(struct thread_data);
+ if (nr_segments + 1 >= REAL_MAX_SEG) {
+ log_err("error: maximum number of jobs reached.\n");
+ return -1;
+ }
- size += 2 * sizeof(unsigned int);
+ size += 2 * sizeof(unsigned int);
#ifndef CONFIG_NO_SHM
- shm_id = shmget(0, size, IPC_CREAT | 0600);
- if (shm_id != -1)
- break;
- if (errno != EINVAL && errno != ENOMEM && errno != ENOSPC) {
+ seg->shm_id = shmget(0, size, IPC_CREAT | 0600);
+ if (seg->shm_id == -1) {
+ if (errno != EINVAL && errno != ENOMEM && errno != ENOSPC)
perror("shmget");
- break;
- }
+ return -1;
+ }
#else
- threads = malloc(size);
- if (threads)
- break;
+ seg->threads = malloc(size);
+ if (!seg->threads)
+ return -1;
#endif
- max_jobs >>= 1;
- } while (max_jobs);
-
#ifndef CONFIG_NO_SHM
- if (shm_id == -1)
- return 1;
-
- threads = shmat(shm_id, NULL, 0);
- if (threads == (void *) -1) {
+ seg->threads = shmat(seg->shm_id, NULL, 0);
+ if (seg->threads == (void *) -1) {
perror("shmat");
return 1;
}
if (shm_attach_to_open_removed())
- shmctl(shm_id, IPC_RMID, NULL);
+ shmctl(seg->shm_id, IPC_RMID, NULL);
#endif
- memset(threads, 0, max_jobs * sizeof(struct thread_data));
- for (i = 0; i < max_jobs; i++)
- DRD_IGNORE_VAR(threads[i]);
- fio_debug_jobp = (unsigned int *)(threads + max_jobs);
+ nr_segments++;
+
+ memset(seg->threads, 0, JOBS_PER_SEG * sizeof(struct thread_data));
+ for (i = 0; i < JOBS_PER_SEG; i++)
+ DRD_IGNORE_VAR(seg->threads[i]);
+ seg->nr_threads = 0;
+
+ /* Not first segment, we're done */
+ if (nr_segments != 1) {
+ cur_segment++;
+ return 0;
+ }
+
+ fio_debug_jobp = (unsigned int *)(seg->threads + JOBS_PER_SEG);
*fio_debug_jobp = -1;
fio_warned = fio_debug_jobp + 1;
*fio_warned = 0;
flow_init();
-
return 0;
}
+/*
+ * The thread areas are shared between the main process and the job
+ * threads/processes, and is split into chunks of JOBS_PER_SEG. If the current
+ * segment has no more room, add a new chunk.
+ */
+static int expand_thread_area(void)
+{
+ struct thread_segment *seg = &segments[cur_segment];
+
+ if (nr_segments && seg->nr_threads < JOBS_PER_SEG)
+ return 0;
+
+ return add_thread_segment();
+}
+
static void dump_print_option(struct print_option *p)
{
const char *delim;
@@ -430,19 +448,6 @@ static void dump_opt_list(struct thread_data *td)
}
}
-static void fio_dump_options_free(struct thread_data *td)
-{
- while (!flist_empty(&td->opt_list)) {
- struct print_option *p;
-
- p = flist_first_entry(&td->opt_list, struct print_option, list);
- flist_del_init(&p->list);
- free(p->name);
- free(p->value);
- free(p);
- }
-}
-
static void copy_opt_list(struct thread_data *dst, struct thread_data *src)
{
struct flist_head *entry;
@@ -470,21 +475,19 @@ static void copy_opt_list(struct thread_data *dst, struct thread_data *src)
static struct thread_data *get_new_job(bool global, struct thread_data *parent,
bool preserve_eo, const char *jobname)
{
+ struct thread_segment *seg;
struct thread_data *td;
if (global)
return &def_thread;
- if (setup_thread_area()) {
+ if (expand_thread_area()) {
log_err("error: failed to setup shm segment\n");
return NULL;
}
- if (thread_number >= max_jobs) {
- log_err("error: maximum number of jobs (%d) reached.\n",
- max_jobs);
- return NULL;
- }
- td = &threads[thread_number++];
+ seg = &segments[cur_segment];
+ td = &seg->threads[seg->nr_threads++];
+ thread_number++;
*td = *parent;
INIT_FLIST_HEAD(&td->opt_list);
@@ -534,7 +537,8 @@ static void put_job(struct thread_data *td)
if (td->o.name)
free(td->o.name);
- memset(&threads[td->thread_number - 1], 0, sizeof(*td));
+ memset(td, 0, sizeof(*td));
+ segments[cur_segment].nr_threads--;
thread_number--;
}
@@ -629,6 +633,11 @@ static int fixup_options(struct thread_data *td)
ret |= 1;
}
+ if (o->zone_mode == ZONE_MODE_ZBD && !o->create_serialize) {
+ log_err("fio: --zonemode=zbd and --create_serialize=0 are not compatible.\n");
+ ret |= 1;
+ }
+
if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) {
log_err("fio: --zonesize must be specified when using --zonemode=strided.\n");
ret |= 1;
@@ -944,9 +953,33 @@ static int fixup_options(struct thread_data *td)
/*
* Fix these up to be nsec internally
*/
- o->max_latency *= 1000ULL;
+ for_each_rw_ddir(ddir)
+ o->max_latency[ddir] *= 1000ULL;
+
o->latency_target *= 1000ULL;
+ /*
+ * Dedupe working set verifications
+ */
+ if (o->dedupe_percentage && o->dedupe_mode == DEDUPE_MODE_WORKING_SET) {
+ if (!fio_option_is_set(o, size)) {
+ log_err("fio: pregenerated dedupe working set "
+ "requires size to be set\n");
+ ret |= 1;
+ } else if (o->nr_files != 1) {
+ log_err("fio: dedupe working set mode supported with "
+ "single file per job, but %d files "
+ "provided\n", o->nr_files);
+ ret |= 1;
+ } else if (o->dedupe_working_set_percentage + o->dedupe_percentage > 100) {
+ log_err("fio: impossible to reach expected dedupe percentage %u "
+ "since %u percentage of size is reserved to dedupe working set "
+ "(those are unique pages)\n",
+ o->dedupe_percentage, o->dedupe_working_set_percentage);
+ ret |= 1;
+ }
+ }
+
return ret;
}
@@ -956,13 +989,13 @@ static void init_rand_file_service(struct thread_data *td)
const unsigned int seed = td->rand_seeds[FIO_RAND_FILE_OFF];
if (td->o.file_service_type == FIO_FSERVICE_ZIPF) {
- zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, seed);
+ zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, td->random_center, seed);
zipf_disable_hash(&td->next_file_zipf);
} else if (td->o.file_service_type == FIO_FSERVICE_PARETO) {
- pareto_init(&td->next_file_zipf, nranges, td->pareto_h, seed);
+ pareto_init(&td->next_file_zipf, nranges, td->pareto_h, td->random_center, seed);
zipf_disable_hash(&td->next_file_zipf);
} else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) {
- gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, seed);
+ gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, td->random_center, seed);
gauss_disable_hash(&td->next_file_gauss);
}
}
@@ -1020,6 +1053,7 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false);
+ init_rand_seed(&td->dedupe_working_set_index_state, td->rand_seeds[FIO_RAND_DEDUPE_WORKING_SET_IX], use64);
if (!td_random(td))
return;
@@ -1087,18 +1121,15 @@ int ioengine_load(struct thread_data *td)
* for this name and see if they match. If they do, then
* the engine is unchanged.
*/
- dlhandle = td->io_ops_dlhandle;
+ dlhandle = td->io_ops->dlhandle;
ops = load_ioengine(td);
if (!ops)
goto fail;
- if (ops == td->io_ops && dlhandle == td->io_ops_dlhandle) {
- if (dlhandle)
- dlclose(dlhandle);
+ if (ops == td->io_ops && dlhandle == td->io_ops->dlhandle)
return 0;
- }
- if (dlhandle && dlhandle != td->io_ops_dlhandle)
+ if (dlhandle && dlhandle != td->io_ops->dlhandle)
dlclose(dlhandle);
/* Unload the old engine. */
@@ -1224,7 +1255,8 @@ enum {
FPRE_NONE = 0,
FPRE_JOBNAME,
FPRE_JOBNUM,
- FPRE_FILENUM
+ FPRE_FILENUM,
+ FPRE_CLIENTUID
};
static struct fpre_keyword {
@@ -1235,6 +1267,7 @@ static struct fpre_keyword {
{ .keyword = "$jobname", .key = FPRE_JOBNAME, },
{ .keyword = "$jobnum", .key = FPRE_JOBNUM, },
{ .keyword = "$filenum", .key = FPRE_FILENUM, },
+ { .keyword = "$clientuid", .key = FPRE_CLIENTUID, },
{ .keyword = NULL, },
};
@@ -1324,6 +1357,21 @@ static char *make_filename(char *buf, size_t buf_size,struct thread_options *o,
}
break;
}
+ case FPRE_CLIENTUID: {
+ int ret;
+ ret = snprintf(dst, dst_left, "%s", client_sockaddr_str);
+ if (ret < 0)
+ break;
+ else if (ret > dst_left) {
+ log_err("fio: truncated filename\n");
+ dst += dst_left;
+ dst_left = 0;
+ } else {
+ dst += ret;
+ dst_left -= ret;
+ }
+ break;
+ }
default:
assert(0);
break;
@@ -1466,6 +1514,9 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
if (fixup_options(td))
goto err;
+ if (init_dedupe_working_set_seeds(td))
+ goto err;
+
/*
* Belongs to fixup_options, but o->name is not necessarily set as yet
*/
@@ -2722,12 +2773,7 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
warnings_fatal = 1;
break;
case 'j':
- max_jobs = atoi(optarg);
- if (!max_jobs || max_jobs > REAL_MAX_JOBS) {
- log_err("fio: invalid max jobs: %d\n", max_jobs);
- do_exit++;
- exit_val = 1;
- }
+ /* we don't track/need this anymore, ignore it */
break;
case 'S':
did_arg = true;
diff --git a/io_u.c b/io_u.c
index f30fc037..9a1cd547 100644
--- a/io_u.c
+++ b/io_u.c
@@ -1326,8 +1326,10 @@ static struct fio_file *__get_next_file(struct thread_data *td)
if (f && fio_file_open(f) && !fio_file_closing(f)) {
if (td->o.file_service_type == FIO_FSERVICE_SEQ)
goto out;
- if (td->file_service_left--)
- goto out;
+ if (td->file_service_left) {
+ td->file_service_left--;
+ goto out;
+ }
}
if (td->o.file_service_type == FIO_FSERVICE_RR ||
@@ -1387,11 +1389,16 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u)
return 0;
}
-static void lat_fatal(struct thread_data *td, struct io_completion_data *icd,
+static void lat_fatal(struct thread_data *td, struct io_u *io_u, struct io_completion_data *icd,
unsigned long long tnsec, unsigned long long max_nsec)
{
- if (!td->error)
- log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec);
+ if (!td->error) {
+ log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec): %s %s %llu %llu\n",
+ tnsec, max_nsec,
+ io_u->file->file_name,
+ io_ddir_name(io_u->ddir),
+ io_u->offset, io_u->buflen);
+ }
td_verror(td, ETIMEDOUT, "max latency exceeded");
icd->error = ETIMEDOUT;
}
@@ -1886,11 +1893,13 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
icd->error = ops->io_u_lat(td, tnsec);
}
- if (td->o.max_latency && tnsec > td->o.max_latency)
- lat_fatal(td, icd, tnsec, td->o.max_latency);
- if (td->o.latency_target && tnsec > td->o.latency_target) {
- if (lat_target_failed(td))
- lat_fatal(td, icd, tnsec, td->o.latency_target);
+ if (ddir_rw(idx)) {
+ if (td->o.max_latency[idx] && tnsec > td->o.max_latency[idx])
+ lat_fatal(td, io_u, icd, tnsec, td->o.max_latency[idx]);
+ if (td->o.latency_target && tnsec > td->o.latency_target) {
+ if (lat_target_failed(td))
+ lat_fatal(td, io_u, icd, tnsec, td->o.latency_target);
+ }
}
}
@@ -2163,6 +2172,7 @@ void io_u_queued(struct thread_data *td, struct io_u *io_u)
static struct frand_state *get_buf_state(struct thread_data *td)
{
unsigned int v;
+ unsigned long long i;
if (!td->o.dedupe_percentage)
return &td->buf_state;
@@ -2174,7 +2184,24 @@ static struct frand_state *get_buf_state(struct thread_data *td)
v = rand_between(&td->dedupe_state, 1, 100);
if (v <= td->o.dedupe_percentage)
- return &td->buf_state_prev;
+ switch (td->o.dedupe_mode) {
+ case DEDUPE_MODE_REPEAT:
+ /*
+ * The caller advances the returned frand_state.
+ * A copy of prev should be returned instead since
+ * a subsequent intention to generate a deduped buffer
+ * might result in generating a unique one
+ */
+ frand_copy(&td->buf_state_ret, &td->buf_state_prev);
+ return &td->buf_state_ret;
+ case DEDUPE_MODE_WORKING_SET:
+ i = rand_between(&td->dedupe_working_set_index_state, 0, td->num_unique_pages - 1);
+ frand_copy(&td->buf_state_ret, &td->dedupe_working_set_states[i]);
+ return &td->buf_state_ret;
+ default:
+ log_err("unexpected dedupe mode %u\n", td->o.dedupe_mode);
+ assert(0);
+ }
return &td->buf_state;
}
diff --git a/ioengines.c b/ioengines.c
index d3be8026..d08a511a 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -15,6 +15,8 @@
#include <dlfcn.h>
#include <fcntl.h>
#include <assert.h>
+#include <sys/types.h>
+#include <dirent.h>
#include "fio.h"
#include "diskutil.h"
@@ -45,7 +47,7 @@ static bool check_engine_ops(struct thread_data *td, struct ioengine_ops *ops)
* async engines aren't reliable with offload
*/
if ((td->o.io_submit_mode == IO_MODE_OFFLOAD) &&
- !(ops->flags & FIO_FAKEIO)) {
+ (ops->flags & FIO_NO_OFFLOAD)) {
log_err("%s: can't be used with offloaded submit. Use a sync "
"engine\n", ops->name);
return true;
@@ -91,8 +93,9 @@ static void *dlopen_external(struct thread_data *td, const char *engine)
char engine_path[PATH_MAX];
void *dlhandle;
- sprintf(engine_path, "%s/lib%s.so", FIO_EXT_ENG_DIR, engine);
+ sprintf(engine_path, "%s/fio-%s.so", FIO_EXT_ENG_DIR, engine);
+ dprint(FD_IO, "dlopen external %s\n", engine_path);
dlhandle = dlopen(engine_path, RTLD_LAZY);
if (!dlhandle)
log_info("Engine %s not found; Either name is invalid, was not built, or fio-engine-%s package is missing.\n",
@@ -110,7 +113,11 @@ static struct ioengine_ops *dlopen_ioengine(struct thread_data *td,
struct ioengine_ops *ops;
void *dlhandle;
- dprint(FD_IO, "dload engine %s\n", engine_lib);
+ if (!strncmp(engine_lib, "linuxaio", 8) ||
+ !strncmp(engine_lib, "aio", 3))
+ engine_lib = "libaio";
+
+ dprint(FD_IO, "dlopen engine %s\n", engine_lib);
dlerror();
dlhandle = dlopen(engine_lib, RTLD_LAZY);
@@ -149,7 +156,7 @@ static struct ioengine_ops *dlopen_ioengine(struct thread_data *td,
return NULL;
}
- td->io_ops_dlhandle = dlhandle;
+ ops->dlhandle = dlhandle;
return ops;
}
@@ -158,7 +165,7 @@ static struct ioengine_ops *__load_ioengine(const char *engine)
/*
* linux libaio has alias names, so convert to what we want
*/
- if (!strncmp(engine, "linuxaio", 8)) {
+ if (!strncmp(engine, "linuxaio", 8) || !strncmp(engine, "aio", 3)) {
dprint(FD_IO, "converting ioengine name: %s -> libaio\n",
engine);
engine = "libaio";
@@ -188,7 +195,9 @@ struct ioengine_ops *load_ioengine(struct thread_data *td)
* so as not to break job files not using the prefix.
*/
ops = __load_ioengine(td->o.ioengine);
- if (!ops)
+
+ /* We do re-dlopen existing handles, for reference counting */
+ if (!ops || ops->dlhandle)
ops = dlopen_ioengine(td, name);
/*
@@ -222,9 +231,9 @@ void free_ioengine(struct thread_data *td)
td->eo = NULL;
}
- if (td->io_ops_dlhandle) {
- dlclose(td->io_ops_dlhandle);
- td->io_ops_dlhandle = NULL;
+ if (td->io_ops->dlhandle) {
+ dprint(FD_IO, "dlclose ioengine %s\n", td->io_ops->name);
+ dlclose(td->io_ops->dlhandle);
}
td->io_ops = NULL;
@@ -404,7 +413,6 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
if (!td->io_ops->commit) {
io_u_mark_submit(td, 1);
io_u_mark_complete(td, 1);
- zbd_put_io_u(td, io_u);
}
if (ret == FIO_Q_COMPLETED) {
@@ -630,6 +638,34 @@ int td_io_get_file_size(struct thread_data *td, struct fio_file *f)
return td->io_ops->get_file_size(td, f);
}
+#ifdef CONFIG_DYNAMIC_ENGINES
+/* Load all dynamic engines in FIO_EXT_ENG_DIR for enghelp command */
+static void
+fio_load_dynamic_engines(struct thread_data *td)
+{
+ DIR *dirhandle = NULL;
+ struct dirent *dirent = NULL;
+ char engine_path[PATH_MAX];
+
+ dirhandle = opendir(FIO_EXT_ENG_DIR);
+ if (!dirhandle)
+ return;
+
+ while ((dirent = readdir(dirhandle)) != NULL) {
+ if (!strcmp(dirent->d_name, ".") ||
+ !strcmp(dirent->d_name, ".."))
+ continue;
+
+ sprintf(engine_path, "%s/%s", FIO_EXT_ENG_DIR, dirent->d_name);
+ dlopen_ioengine(td, engine_path);
+ }
+
+ closedir(dirhandle);
+}
+#else
+#define fio_load_dynamic_engines(td) do { } while (0)
+#endif
+
int fio_show_ioengine_help(const char *engine)
{
struct flist_head *entry;
@@ -638,8 +674,11 @@ int fio_show_ioengine_help(const char *engine)
char *sep;
int ret = 1;
+ memset(&td, 0, sizeof(struct thread_data));
+
if (!engine || !*engine) {
log_info("Available IO engines:\n");
+ fio_load_dynamic_engines(&td);
flist_for_each(entry, &engine_list) {
io_ops = flist_entry(entry, struct ioengine_ops, list);
log_info("\t%s\n", io_ops->name);
@@ -652,19 +691,18 @@ int fio_show_ioengine_help(const char *engine)
sep++;
}
- memset(&td, 0, sizeof(struct thread_data));
td.o.ioengine = (char *)engine;
- io_ops = load_ioengine(&td);
+ td.io_ops = load_ioengine(&td);
- if (!io_ops) {
+ if (!td.io_ops) {
log_info("IO engine %s not found\n", engine);
return 1;
}
- if (io_ops->options)
- ret = show_cmd_help(io_ops->options, sep);
+ if (td.io_ops->options)
+ ret = show_cmd_help(td.io_ops->options, sep);
else
- log_info("IO engine %s has no options\n", io_ops->name);
+ log_info("IO engine %s has no options\n", td.io_ops->name);
free_ioengine(&td);
return ret;
diff --git a/ioengines.h b/ioengines.h
index 54dadba2..b3f755b4 100644
--- a/ioengines.h
+++ b/ioengines.h
@@ -8,7 +8,7 @@
#include "io_u.h"
#include "zbd_types.h"
-#define FIO_IOOPS_VERSION 26
+#define FIO_IOOPS_VERSION 30
#ifndef CONFIG_DYNAMIC_ENGINES
#define FIO_STATIC static
@@ -30,6 +30,7 @@ struct ioengine_ops {
const char *name;
int version;
int flags;
+ void *dlhandle;
int (*setup)(struct thread_data *);
int (*init)(struct thread_data *);
int (*post_init)(struct thread_data *);
@@ -46,6 +47,7 @@ struct ioengine_ops {
int (*invalidate)(struct thread_data *, struct fio_file *);
int (*unlink_file)(struct thread_data *, struct fio_file *);
int (*get_file_size)(struct thread_data *, struct fio_file *);
+ int (*prepopulate_file)(struct thread_data *, struct fio_file *);
void (*terminate)(struct thread_data *);
int (*iomem_alloc)(struct thread_data *, size_t);
void (*iomem_free)(struct thread_data *);
@@ -57,6 +59,8 @@ struct ioengine_ops {
uint64_t, struct zbd_zone *, unsigned int);
int (*reset_wp)(struct thread_data *, struct fio_file *,
uint64_t, uint64_t);
+ int (*get_max_open_zones)(struct thread_data *, struct fio_file *,
+ unsigned int *);
int option_struct_size;
struct fio_option *options;
};
@@ -77,7 +81,8 @@ enum fio_ioengine_flags {
FIO_NOSTATS = 1 << 12, /* don't do IO stats */
FIO_NOFILEHASH = 1 << 13, /* doesn't hash the files for lookup later. */
FIO_ASYNCIO_SYNC_TRIM
- = 1 << 14 /* io engine has async ->queue except for trim */
+ = 1 << 14, /* io engine has async ->queue except for trim */
+ FIO_NO_OFFLOAD = 1 << 15, /* no async offload */
};
/*
diff --git a/iolog.c b/iolog.c
index fa40c857..26501b4a 100644
--- a/iolog.c
+++ b/iolog.c
@@ -151,7 +151,8 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
while (!flist_empty(&td->io_log_list)) {
int ret;
- if (td->o.read_iolog_chunked) {
+
+ if (!td->io_log_blktrace && td->o.read_iolog_chunked) {
if (td->io_log_checkmark == td->io_log_current) {
if (!read_iolog2(td))
return 1;
@@ -607,12 +608,11 @@ static int open_socket(const char *path)
/*
* open iolog, check version, and call appropriate parser
*/
-static bool init_iolog_read(struct thread_data *td)
+static bool init_iolog_read(struct thread_data *td, char *fname)
{
- char buffer[256], *p, *fname;
+ char buffer[256], *p;
FILE *f = NULL;
- fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
dprint(FD_IO, "iolog: name=%s\n", fname);
if (is_socket(fname)) {
@@ -701,15 +701,19 @@ bool init_iolog(struct thread_data *td)
if (td->o.read_iolog_file) {
int need_swap;
+ char * fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
/*
* Check if it's a blktrace file and load that if possible.
* Otherwise assume it's a normal log file and load that.
*/
- if (is_blktrace(td->o.read_iolog_file, &need_swap))
- ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
- else
- ret = init_iolog_read(td);
+ if (is_blktrace(fname, &need_swap)) {
+ td->io_log_blktrace = 1;
+ ret = load_blktrace(td, fname, need_swap);
+ } else {
+ td->io_log_blktrace = 0;
+ ret = init_iolog_read(td, fname);
+ }
} else if (td->o.write_iolog_file)
ret = init_iolog_write(td);
else
diff --git a/iolog.h b/iolog.h
index 981081f9..9e382cc0 100644
--- a/iolog.h
+++ b/iolog.h
@@ -182,6 +182,7 @@ static inline struct io_sample *__get_sample(void *samples, int log_offset,
struct io_logs *iolog_cur_log(struct io_log *);
uint64_t iolog_nr_samples(struct io_log *);
void regrow_logs(struct thread_data *);
+void regrow_agg_logs(void);
static inline struct io_sample *get_sample(struct io_log *iolog,
struct io_logs *cur_log,
diff --git a/lib/gauss.c b/lib/gauss.c
index 3f84dbc6..c64f61e7 100644
--- a/lib/gauss.c
+++ b/lib/gauss.c
@@ -40,11 +40,11 @@ unsigned long long gauss_next(struct gauss_state *gs)
if (!gs->disable_hash)
sum = __hash_u64(sum);
- return sum % gs->nranges;
+ return (sum + gs->rand_off) % gs->nranges;
}
void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
- unsigned int seed)
+ double center, unsigned int seed)
{
memset(gs, 0, sizeof(*gs));
init_rand_seed(&gs->r, seed, 0);
@@ -55,6 +55,10 @@ void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
if (gs->stddev > nranges / 2)
gs->stddev = nranges / 2;
}
+ if (center == -1)
+ gs->rand_off = 0;
+ else
+ gs->rand_off = nranges * (center - 0.5);
}
void gauss_disable_hash(struct gauss_state *gs)
diff --git a/lib/gauss.h b/lib/gauss.h
index 478aa146..19e3a666 100644
--- a/lib/gauss.h
+++ b/lib/gauss.h
@@ -8,11 +8,12 @@ struct gauss_state {
struct frand_state r;
uint64_t nranges;
unsigned int stddev;
+ unsigned int rand_off;
bool disable_hash;
};
void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
- unsigned int seed);
+ double center, unsigned int seed);
unsigned long long gauss_next(struct gauss_state *gs);
void gauss_disable_hash(struct gauss_state *gs);
diff --git a/lib/num2str.c b/lib/num2str.c
index 726f1c44..cd89a0e5 100644
--- a/lib/num2str.c
+++ b/lib/num2str.c
@@ -7,8 +7,6 @@
#include "../oslib/asprintf.h"
#include "num2str.h"
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
-
/**
* num2str() - Cheesy number->string conversion, complete with carry rounding error.
* @num: quantity (e.g., number of blocks, bytes or bits)
@@ -38,7 +36,7 @@ char *num2str(uint64_t num, int maxlen, int base, int pow2, enum n2s_unit units)
char *buf;
compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes");
- assert(units < ARRAY_SIZE(unitstr));
+ assert(units < FIO_ARRAY_SIZE(unitstr));
if (pow2)
unitprefix = iecstr;
@@ -69,7 +67,7 @@ char *num2str(uint64_t num, int maxlen, int base, int pow2, enum n2s_unit units)
* Divide by K/Ki until string length of num <= maxlen.
*/
modulo = -1U;
- while (post_index < ARRAY_SIZE(sistr)) {
+ while (post_index < FIO_ARRAY_SIZE(sistr)) {
sprintf(tmp, "%llu", (unsigned long long) num);
if (strlen(tmp) <= maxlen)
break;
@@ -80,7 +78,7 @@ char *num2str(uint64_t num, int maxlen, int base, int pow2, enum n2s_unit units)
post_index++;
}
- if (post_index >= ARRAY_SIZE(sistr))
+ if (post_index >= FIO_ARRAY_SIZE(sistr))
post_index = 0;
/*
@@ -112,6 +110,9 @@ done:
sprintf(tmp, "%.*f", (int)(maxlen - strlen(tmp) - 1),
(double)modulo / (double)thousand);
+ if (tmp[0] == '1')
+ num++;
+
if (asprintf(&buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2],
unitprefix[post_index], unitstr[units]) < 0)
buf = NULL;
diff --git a/lib/prio_tree.c b/lib/prio_tree.c
index d8e1b89a..c4f66a49 100644
--- a/lib/prio_tree.c
+++ b/lib/prio_tree.c
@@ -18,8 +18,6 @@
#include "../compiler/compiler.h"
#include "prio_tree.h"
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
-
/*
* A clever mix of heap and radix trees forms a radix priority search tree (PST)
* which is useful for storing intervals, e.g, we can consider a vma as a closed
@@ -57,9 +55,9 @@ static void fio_init prio_tree_init(void)
{
unsigned int i;
- for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
+ for (i = 0; i < FIO_ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
- index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
+ index_bits_to_maxindex[FIO_ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
}
/*
diff --git a/lib/rand.c b/lib/rand.c
index 5eb6e60a..e74da609 100644
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -125,10 +125,7 @@ void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
uint64_t fill_random_buf(struct frand_state *fs, void *buf,
unsigned int len)
{
- uint64_t r = __rand(fs);
-
- if (sizeof(int) != sizeof(long *))
- r *= (unsigned long) __rand(fs);
+ uint64_t r = __get_next_seed(fs);
__fill_random_buf(buf, len, r);
return r;
@@ -188,10 +185,7 @@ uint64_t fill_random_buf_percentage(struct frand_state *fs, void *buf,
unsigned int segment, unsigned int len,
char *pattern, unsigned int pbytes)
{
- uint64_t r = __rand(fs);
-
- if (sizeof(int) != sizeof(long *))
- r *= (unsigned long) __rand(fs);
+ uint64_t r = __get_next_seed(fs);
__fill_random_buf_percentage(r, buf, percentage, segment, len,
pattern, pbytes);
diff --git a/lib/rand.h b/lib/rand.h
index 46c1c5e0..a8060045 100644
--- a/lib/rand.h
+++ b/lib/rand.h
@@ -150,6 +150,16 @@ static inline uint64_t rand_between(struct frand_state *state, uint64_t start,
return start + rand32_upto(state, end - start);
}
+static inline uint64_t __get_next_seed(struct frand_state *fs)
+{
+ uint64_t r = __rand(fs);
+
+ if (sizeof(int) != sizeof(long *))
+ r *= (unsigned long) __rand(fs);
+
+ return r;
+}
+
extern void init_rand(struct frand_state *, bool);
extern void init_rand_seed(struct frand_state *, uint64_t seed, bool);
extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed);
diff --git a/lib/zipf.c b/lib/zipf.c
index 321a4fb9..14d7928f 100644
--- a/lib/zipf.c
+++ b/lib/zipf.c
@@ -23,19 +23,21 @@ static void zipf_update(struct zipf_state *zs)
}
static void shared_rand_init(struct zipf_state *zs, uint64_t nranges,
- unsigned int seed)
+ double center, unsigned int seed)
{
memset(zs, 0, sizeof(*zs));
zs->nranges = nranges;
init_rand_seed(&zs->rand, seed, 0);
zs->rand_off = __rand(&zs->rand);
+ if (center != -1)
+ zs->rand_off = nranges * center;
}
void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta,
- unsigned int seed)
+ double center, unsigned int seed)
{
- shared_rand_init(zs, nranges, seed);
+ shared_rand_init(zs, nranges, center, seed);
zs->theta = theta;
zs->zeta2 = pow(1.0, zs->theta) + pow(0.5, zs->theta);
@@ -71,9 +73,9 @@ uint64_t zipf_next(struct zipf_state *zs)
}
void pareto_init(struct zipf_state *zs, uint64_t nranges, double h,
- unsigned int seed)
+ double center, unsigned int seed)
{
- shared_rand_init(zs, nranges, seed);
+ shared_rand_init(zs, nranges, center, seed);
zs->pareto_pow = log(h) / log(1.0 - h);
}
diff --git a/lib/zipf.h b/lib/zipf.h
index 16b65f57..332e3b2f 100644
--- a/lib/zipf.h
+++ b/lib/zipf.h
@@ -16,10 +16,12 @@ struct zipf_state {
bool disable_hash;
};
-void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta, unsigned int seed);
+void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta,
+ double center, unsigned int seed);
uint64_t zipf_next(struct zipf_state *zs);
-void pareto_init(struct zipf_state *zs, uint64_t nranges, double h, unsigned int seed);
+void pareto_init(struct zipf_state *zs, uint64_t nranges, double h,
+ double center, unsigned int seed);
uint64_t pareto_next(struct zipf_state *zs);
void zipf_disable_hash(struct zipf_state *zs);
diff --git a/libfio.c b/libfio.c
index 7348b164..6144a474 100644
--- a/libfio.c
+++ b/libfio.c
@@ -156,8 +156,13 @@ void reset_all_stats(struct thread_data *td)
void reset_fio_state(void)
{
+ int i;
+
groupid = 0;
thread_number = 0;
+ cur_segment = 0;
+ for (i = 0; i < nr_segments; i++)
+ segments[i].nr_threads = 0;
stat_number = 0;
done_secs = 0;
}
diff --git a/log.c b/log.c
index 6c36813d..562a29aa 100644
--- a/log.c
+++ b/log.c
@@ -42,6 +42,7 @@ size_t log_valist(const char *fmt, va_list args)
}
/* add prefix for the specified type in front of the valist */
+#ifdef FIO_INC_DEBUG
void log_prevalist(int type, const char *fmt, va_list args)
{
char *buf1, *buf2;
@@ -64,6 +65,7 @@ void log_prevalist(int type, const char *fmt, va_list args)
len = log_info_buf(buf2, len);
free(buf2);
}
+#endif
ssize_t log_info(const char *format, ...)
{
diff --git a/optgroup.c b/optgroup.c
index c228ff29..bebb4a51 100644
--- a/optgroup.c
+++ b/optgroup.c
@@ -142,6 +142,10 @@ static const struct opt_group fio_opt_cat_groups[] = {
.mask = FIO_OPT_G_RDMA,
},
{
+ .name = "librpma I/O engines", /* librpma_apm && librpma_gpspm */
+ .mask = FIO_OPT_G_LIBRPMA,
+ },
+ {
.name = "libaio I/O engine", /* libaio */
.mask = FIO_OPT_G_LIBAIO,
},
@@ -174,6 +178,18 @@ static const struct opt_group fio_opt_cat_groups[] = {
.mask = FIO_OPT_G_NBD,
},
{
+ .name = "libcufile I/O engine", /* libcufile */
+ .mask = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = "DAOS File System (dfs) I/O engine", /* dfs */
+ .mask = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "NFS I/O engine", /* nfs */
+ .mask = FIO_OPT_G_NFS,
+ },
+ {
.name = NULL,
},
};
diff --git a/optgroup.h b/optgroup.h
index 5789afd3..1fb84a29 100644
--- a/optgroup.h
+++ b/optgroup.h
@@ -52,6 +52,7 @@ enum opt_category_group {
__FIO_OPT_G_E4DEFRAG,
__FIO_OPT_G_NETIO,
__FIO_OPT_G_RDMA,
+ __FIO_OPT_G_LIBRPMA,
__FIO_OPT_G_LIBAIO,
__FIO_OPT_G_ACT,
__FIO_OPT_G_LATPROF,
@@ -67,6 +68,9 @@ enum opt_category_group {
__FIO_OPT_G_IOURING,
__FIO_OPT_G_FILESTAT,
__FIO_OPT_G_NR,
+ __FIO_OPT_G_LIBCUFILE,
+ __FIO_OPT_G_DFS,
+ __FIO_OPT_G_NFS,
FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE),
FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE),
@@ -93,6 +97,7 @@ enum opt_category_group {
FIO_OPT_G_E4DEFRAG = (1ULL << __FIO_OPT_G_E4DEFRAG),
FIO_OPT_G_NETIO = (1ULL << __FIO_OPT_G_NETIO),
FIO_OPT_G_RDMA = (1ULL << __FIO_OPT_G_RDMA),
+ FIO_OPT_G_LIBRPMA = (1ULL << __FIO_OPT_G_LIBRPMA),
FIO_OPT_G_LIBAIO = (1ULL << __FIO_OPT_G_LIBAIO),
FIO_OPT_G_ACT = (1ULL << __FIO_OPT_G_ACT),
FIO_OPT_G_LATPROF = (1ULL << __FIO_OPT_G_LATPROF),
@@ -106,8 +111,11 @@ enum opt_category_group {
FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR),
FIO_OPT_G_ISCSI = (1ULL << __FIO_OPT_G_ISCSI),
FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD),
+ FIO_OPT_G_NFS = (1ULL << __FIO_OPT_G_NFS),
FIO_OPT_G_IOURING = (1ULL << __FIO_OPT_G_IOURING),
FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT),
+ FIO_OPT_G_LIBCUFILE = (1ULL << __FIO_OPT_G_LIBCUFILE),
+ FIO_OPT_G_DFS = (1ULL << __FIO_OPT_G_DFS),
};
extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
diff --git a/options.c b/options.c
index b497d973..8c2ab7cc 100644
--- a/options.c
+++ b/options.c
@@ -22,7 +22,7 @@ char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 };
static const struct pattern_fmt_desc fmt_desc[] = {
{
.fmt = "%o",
- .len = FIELD_SIZE(struct io_u *, offset),
+ .len = FIO_FIELD_SIZE(struct io_u *, offset),
.paste = paste_blockoff
},
{ }
@@ -44,6 +44,27 @@ static char *get_opt_postfix(const char *str)
return strdup(p);
}
+static bool split_parse_distr(const char *str, double *val, double *center)
+{
+ char *cp, *p;
+ bool r;
+
+ p = strdup(str);
+ if (!p)
+ return false;
+
+ cp = strstr(p, ":");
+ r = true;
+ if (cp) {
+ *cp = '\0';
+ cp++;
+ r = str_to_float(cp, center, 0);
+ }
+ r = r && str_to_float(p, val, 0);
+ free(p);
+ return r;
+}
+
static int bs_cmp(const void *p1, const void *p2)
{
const struct bssplit *bsp1 = p1;
@@ -787,6 +808,7 @@ static int str_fst_cb(void *data, const char *str)
{
struct thread_data *td = cb_data_to_td(data);
double val;
+ double center = -1;
bool done = false;
char *nr;
@@ -821,7 +843,7 @@ static int str_fst_cb(void *data, const char *str)
return 0;
nr = get_opt_postfix(str);
- if (nr && !str_to_float(nr, &val, 0)) {
+ if (nr && !split_parse_distr(nr, &val, &center)) {
log_err("fio: file service type random postfix parsing failed\n");
free(nr);
return 1;
@@ -829,6 +851,12 @@ static int str_fst_cb(void *data, const char *str)
free(nr);
+ if (center != -1 && (center < 0.00 || center > 1.00)) {
+ log_err("fio: distribution center out of range (0 <= center <= 1.0)\n");
+ return 1;
+ }
+ td->random_center = center;
+
switch (td->o.file_service_type) {
case FIO_FSERVICE_ZIPF:
if (val == 1.00) {
@@ -1030,6 +1058,7 @@ static int str_random_distribution_cb(void *data, const char *str)
{
struct thread_data *td = cb_data_to_td(data);
double val;
+ double center = -1;
char *nr;
if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
@@ -1046,7 +1075,7 @@ static int str_random_distribution_cb(void *data, const char *str)
return 0;
nr = get_opt_postfix(str);
- if (nr && !str_to_float(nr, &val, 0)) {
+ if (nr && !split_parse_distr(nr, &val, &center)) {
log_err("fio: random postfix parsing failed\n");
free(nr);
return 1;
@@ -1054,6 +1083,12 @@ static int str_random_distribution_cb(void *data, const char *str)
free(nr);
+ if (center != -1 && (center < 0.00 || center > 1.00)) {
+ log_err("fio: distribution center out of range (0 <= center <= 1.0)\n");
+ return 1;
+ }
+ td->o.random_center.u.f = center;
+
if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) {
if (val == 1.00) {
log_err("fio: zipf theta must different than 1.0\n");
@@ -1387,7 +1422,7 @@ static int str_verify_pattern_cb(void *data, const char *input)
struct thread_data *td = cb_data_to_td(data);
int ret;
- td->o.verify_fmt_sz = ARRAY_SIZE(td->o.verify_fmt);
+ td->o.verify_fmt_sz = FIO_ARRAY_SIZE(td->o.verify_fmt);
ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern,
MAX_PATTERN_SIZE, fmt_desc,
td->o.verify_fmt, &td->o.verify_fmt_sz);
@@ -1436,8 +1471,13 @@ static int str_offset_cb(void *data, unsigned long long *__val)
if (parse_is_percent(v)) {
td->o.start_offset = 0;
td->o.start_offset_percent = -1ULL - v;
+ td->o.start_offset_nz = 0;
dprint(FD_PARSE, "SET start_offset_percent %d\n",
td->o.start_offset_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.start_offset = 0;
+ td->o.start_offset_percent = 0;
+ td->o.start_offset_nz = v - ZONE_BASE_VAL;
} else
td->o.start_offset = v;
@@ -1452,8 +1492,13 @@ static int str_offset_increment_cb(void *data, unsigned long long *__val)
if (parse_is_percent(v)) {
td->o.offset_increment = 0;
td->o.offset_increment_percent = -1ULL - v;
+ td->o.offset_increment_nz = 0;
dprint(FD_PARSE, "SET offset_increment_percent %d\n",
td->o.offset_increment_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.offset_increment = 0;
+ td->o.offset_increment_percent = 0;
+ td->o.offset_increment_nz = v - ZONE_BASE_VAL;
} else
td->o.offset_increment = v;
@@ -1470,6 +1515,10 @@ static int str_size_cb(void *data, unsigned long long *__val)
td->o.size_percent = -1ULL - v;
dprint(FD_PARSE, "SET size_percent %d\n",
td->o.size_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.size = 0;
+ td->o.size_percent = 0;
+ td->o.size_nz = v - ZONE_BASE_VAL;
} else
td->o.size = v;
@@ -1490,12 +1539,30 @@ static int str_io_size_cb(void *data, unsigned long long *__val)
}
dprint(FD_PARSE, "SET io_size_percent %d\n",
td->o.io_size_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.io_size = 0;
+ td->o.io_size_percent = 0;
+ td->o.io_size_nz = v - ZONE_BASE_VAL;
} else
td->o.io_size = v;
return 0;
}
+static int str_zoneskip_cb(void *data, unsigned long long *__val)
+{
+ struct thread_data *td = cb_data_to_td(data);
+ unsigned long long v = *__val;
+
+ if (parse_is_zone(v)) {
+ td->o.zone_skip = 0;
+ td->o.zone_skip_nz = v - ZONE_BASE_VAL;
+ } else
+ td->o.zone_skip = v;
+
+ return 0;
+}
+
static int str_write_bw_log_cb(void *data, const char *str)
{
struct thread_data *td = cb_data_to_td(data);
@@ -1637,6 +1704,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.lname = "Filename(s)",
.type = FIO_OPT_STR_STORE,
.off1 = offsetof(struct thread_options, filename),
+ .maxlen = PATH_MAX,
.cb = str_filename_cb,
.prio = -1, /* must come after "directory" */
.help = "File(s) to use for the workload",
@@ -1877,6 +1945,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.help = "RDMA IO engine",
},
#endif
+#ifdef CONFIG_LIBRPMA_APM
+ { .ival = "librpma_apm",
+ .help = "librpma IO engine in APM mode",
+ },
+#endif
+#ifdef CONFIG_LIBRPMA_GPSPM
+ { .ival = "librpma_gpspm",
+ .help = "librpma IO engine in GPSPM mode",
+ },
+#endif
#ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT
{ .ival = "e4defrag",
.help = "ext4 defrag engine",
@@ -1943,6 +2021,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{ .ival = "nbd",
.help = "Network Block Device (NBD) IO engine"
},
+#ifdef CONFIG_DFS
+ { .ival = "dfs",
+ .help = "DAOS File System (dfs) IO engine",
+ },
+#endif
+#ifdef CONFIG_NFS
+ { .ival = "nfs",
+ .help = "NFS IO engine",
+ },
+#endif
},
},
{
@@ -2045,11 +2133,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{
.name = "size",
.lname = "Size",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_size_cb,
.off1 = offsetof(struct thread_options, size),
.help = "Total size of device or files",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
@@ -2057,11 +2144,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.name = "io_size",
.alias = "io_limit",
.lname = "IO Size",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_io_size_cb,
.off1 = offsetof(struct thread_options, io_size),
.help = "Total size of I/O to be performed",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
@@ -2102,12 +2188,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.name = "offset",
.lname = "IO offset",
.alias = "fileoffset",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_offset_cb,
.off1 = offsetof(struct thread_options, start_offset),
.help = "Start IO from this offset",
.def = "0",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
@@ -2125,14 +2210,13 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{
.name = "offset_increment",
.lname = "IO offset increment",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_offset_increment_cb,
.off1 = offsetof(struct thread_options, offset_increment),
.help = "What is the increment from one offset to the next",
.parent = "offset",
.hide = 1,
.def = "0",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
@@ -3368,11 +3452,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{
.name = "zoneskip",
.lname = "Zone skip",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
+ .cb = str_zoneskip_cb,
.off1 = offsetof(struct thread_options, zone_skip),
.help = "Space between IO zones",
.def = "0",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_ZONE,
},
@@ -3409,6 +3493,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.group = FIO_OPT_G_INVALID,
},
{
+ .name = "ignore_zone_limits",
+ .lname = "Ignore zone resource limits",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, ignore_zone_limits),
+ .def = "0",
+ .help = "Ignore the zone resource limits (max open/active zones) reported by the device",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
.name = "zone_reset_threshold",
.lname = "Zone reset threshold",
.help = "Zoned block device reset threshold",
@@ -3573,6 +3667,28 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.group = FIO_OPT_G_THINKTIME,
},
{
+ .name = "thinktime_blocks_type",
+ .lname = "Thinktime blocks type",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct thread_options, thinktime_blocks_type),
+ .help = "How thinktime_blocks takes effect",
+ .def = "complete",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_THINKTIME,
+ .posval = {
+ { .ival = "complete",
+ .oval = THINKTIME_BLOCKS_TYPE_COMPLETE,
+ .help = "thinktime_blocks takes effect at the completion side",
+ },
+ {
+ .ival = "issue",
+ .oval = THINKTIME_BLOCKS_TYPE_ISSUE,
+ .help = "thinktime_blocks takes effect at the issue side",
+ },
+ },
+ .parent = "thinktime",
+ },
+ {
.name = "rate",
.lname = "I/O rate",
.type = FIO_OPT_ULL,
@@ -3670,8 +3786,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{
.name = "max_latency",
.lname = "Max Latency (usec)",
- .type = FIO_OPT_STR_VAL_TIME,
- .off1 = offsetof(struct thread_options, max_latency),
+ .type = FIO_OPT_ULL,
+ .off1 = offsetof(struct thread_options, max_latency[DDIR_READ]),
+ .off2 = offsetof(struct thread_options, max_latency[DDIR_WRITE]),
+ .off3 = offsetof(struct thread_options, max_latency[DDIR_TRIM]),
.help = "Maximum tolerated IO latency (usec)",
.is_time = 1,
.category = FIO_OPT_C_IO,
@@ -3733,14 +3851,32 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{
.name = "sync",
.lname = "Synchronous I/O",
- .type = FIO_OPT_BOOL,
+ .type = FIO_OPT_STR,
.off1 = offsetof(struct thread_options, sync_io),
- .help = "Use O_SYNC for buffered writes",
- .def = "0",
- .parent = "buffered",
+ .help = "Use synchronous write IO",
+ .def = "none",
.hide = 1,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_IO_TYPE,
+ .posval = {
+ { .ival = "none",
+ .oval = 0,
+ },
+ { .ival = "0",
+ .oval = 0,
+ },
+ { .ival = "sync",
+ .oval = O_SYNC,
+ },
+ { .ival = "1",
+ .oval = O_SYNC,
+ },
+#ifdef O_DSYNC
+ { .ival = "dsync",
+ .oval = O_DSYNC,
+ },
+#endif
+ },
},
#ifdef FIO_HAVE_WRITE_HINT
{
@@ -4362,6 +4498,40 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.group = FIO_OPT_G_IO_BUF,
},
{
+ .name = "dedupe_mode",
+ .lname = "Dedupe mode",
+ .help = "Mode for the deduplication buffer generation",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct thread_options, dedupe_mode),
+ .parent = "dedupe_percentage",
+ .def = "repeat",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_IO_BUF,
+ .posval = {
+ { .ival = "repeat",
+ .oval = DEDUPE_MODE_REPEAT,
+ .help = "repeat previous page",
+ },
+ { .ival = "working_set",
+ .oval = DEDUPE_MODE_WORKING_SET,
+ .help = "choose a page randomly from limited working set defined in dedupe_working_set_percentage",
+ },
+ },
+ },
+ {
+ .name = "dedupe_working_set_percentage",
+ .lname = "Dedupe working set percentage",
+ .help = "Dedupe working set size in percentages from file or device size used to generate dedupe patterns from",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, dedupe_working_set_percentage),
+ .parent = "dedupe_percentage",
+ .def = "5",
+ .maxval = 100,
+ .minval = 0,
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_IO_BUF,
+ },
+ {
.name = "clat_percentiles",
.lname = "Completion latency percentiles",
.type = FIO_OPT_BOOL,
@@ -4512,12 +4682,39 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
{
.name = "unified_rw_reporting",
.lname = "Unified RW Reporting",
- .type = FIO_OPT_BOOL,
+ .type = FIO_OPT_STR,
.off1 = offsetof(struct thread_options, unified_rw_rep),
.help = "Unify reporting across data direction",
- .def = "0",
+ .def = "none",
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_INVALID,
+ .posval = {
+ { .ival = "none",
+ .oval = UNIFIED_SPLIT,
+ .help = "Normal statistics reporting",
+ },
+ { .ival = "mixed",
+ .oval = UNIFIED_MIXED,
+ .help = "Statistics are summed per data direction and reported together",
+ },
+ { .ival = "both",
+ .oval = UNIFIED_BOTH,
+ .help = "Statistics are reported normally, followed by the mixed statistics"
+ },
+ /* Compatibility with former boolean values */
+ { .ival = "0",
+ .oval = UNIFIED_SPLIT,
+ .help = "Alias for 'none'",
+ },
+ { .ival = "1",
+ .oval = UNIFIED_MIXED,
+ .help = "Alias for 'mixed'",
+ },
+ { .ival = "2",
+ .oval = UNIFIED_BOTH,
+ .help = "Alias for 'both'",
+ },
+ },
},
{
.name = "continue_on_error",
@@ -5046,7 +5243,7 @@ static char *fio_keyword_replace(char *opt)
struct fio_keyword *kw = &fio_keywords[i];
while ((s = strstr(opt, kw->word)) != NULL) {
- char *new = malloc(strlen(opt) + 1);
+ char *new = calloc(strlen(opt) + 1, 1);
char *o_org = opt;
int olen = s - opt;
int len;
@@ -5062,9 +5259,10 @@ static char *fio_keyword_replace(char *opt)
* If there's more in the original string, copy that
* in too
*/
- opt += strlen(kw->word) + olen;
+ opt += olen + strlen(kw->word);
+ /* keeps final zero thanks to calloc */
if (strlen(opt))
- memcpy(new + olen + len, opt, opt - o_org - 1);
+ memcpy(new + olen + len, opt, strlen(opt));
/*
* replace opt and free the old opt
@@ -5349,6 +5547,19 @@ void fio_options_free(struct thread_data *td)
}
}
+void fio_dump_options_free(struct thread_data *td)
+{
+ while (!flist_empty(&td->opt_list)) {
+ struct print_option *p;
+
+ p = flist_first_entry(&td->opt_list, struct print_option, list);
+ flist_del_init(&p->list);
+ free(p->name);
+ free(p->value);
+ free(p);
+ }
+}
+
struct fio_option *fio_option_find(const char *name)
{
return find_option(fio_options, name);
diff --git a/options.h b/options.h
index 5276f31e..df80fd98 100644
--- a/options.h
+++ b/options.h
@@ -16,6 +16,7 @@ void add_opt_posval(const char *, const char *, const char *);
void del_opt_posval(const char *, const char *);
struct thread_data;
void fio_options_free(struct thread_data *);
+void fio_dump_options_free(struct thread_data *);
char *get_next_str(char **ptr);
int get_max_str_idx(char *input);
char* get_name_by_idx(char *input, int index);
diff --git a/os/os-aix.h b/os/os-aix.h
index 1aab96e0..db99eef4 100644
--- a/os/os-aix.h
+++ b/os/os-aix.h
@@ -18,6 +18,12 @@
#define FIO_USE_GENERIC_SWAP
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int blockdev_invalidate_cache(struct fio_file *f)
{
return ENOTSUP;
diff --git a/os/os-android.h b/os/os-android.h
index 3c050776..a81cd815 100644
--- a/os/os-android.h
+++ b/os/os-android.h
@@ -58,6 +58,12 @@
#define MAP_HUGETLB 0x40000 /* arch specific */
#endif
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
#ifndef CONFIG_NO_SHM
/*
* Bionic doesn't support SysV shared memeory, so implement it using ashmem
@@ -65,11 +71,15 @@
#include <stdio.h>
#include <linux/ashmem.h>
#include <linux/shm.h>
+#include <android/api-level.h>
+#if __ANDROID_API__ >= __ANDROID_API_O__
+#include <android/sharedmem.h>
+#else
+#define ASHMEM_DEVICE "/dev/ashmem"
+#endif
#define shmid_ds shmid64_ds
#define SHM_HUGETLB 04000
-#define ASHMEM_DEVICE "/dev/ashmem"
-
static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf)
{
int ret=0;
@@ -83,6 +93,16 @@ static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf)
return ret;
}
+#if __ANDROID_API__ >= __ANDROID_API_O__
+static inline int shmget(key_t __key, size_t __size, int __shmflg)
+{
+ char keybuf[11];
+
+ sprintf(keybuf, "%d", __key);
+
+ return ASharedMemory_create(keybuf, __size + sizeof(uint64_t));
+}
+#else
static inline int shmget(key_t __key, size_t __size, int __shmflg)
{
int fd,ret;
@@ -108,6 +128,7 @@ error:
close(fd);
return ret;
}
+#endif
static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg)
{
diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h
index 44bfcd5d..6e465894 100644
--- a/os/os-dragonfly.h
+++ b/os/os-dragonfly.h
@@ -92,6 +92,12 @@ typedef cpumask_t os_cpu_mask_t;
/* No CPU_COUNT(), but use the default function defined in os/os.h */
#define fio_cpu_count(mask) CPU_COUNT((mask))
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int fio_cpuset_init(os_cpu_mask_t *mask)
{
CPUMASK_ASSZERO(*mask);
diff --git a/os/os-freebsd.h b/os/os-freebsd.h
index b3addf98..1b24fa02 100644
--- a/os/os-freebsd.h
+++ b/os/os-freebsd.h
@@ -37,6 +37,12 @@ typedef cpuset_t os_cpu_mask_t;
#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0)
#define fio_cpu_count(mask) CPU_COUNT((mask))
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int fio_cpuset_init(os_cpu_mask_t *mask)
{
CPU_ZERO(mask);
diff --git a/os/os-hpux.h b/os/os-hpux.h
index c1dafe42..a80cb2bc 100644
--- a/os/os-hpux.h
+++ b/os/os-hpux.h
@@ -38,6 +38,13 @@
#define FIO_USE_GENERIC_SWAP
#define FIO_OS_HAVE_AIOCB_TYPEDEF
+
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
typedef struct aiocb64 os_aiocb_t;
static inline int blockdev_invalidate_cache(struct fio_file *f)
diff --git a/os/os-linux.h b/os/os-linux.h
index 65d3b429..16ed5258 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -14,7 +14,6 @@
#include <errno.h>
#include <sched.h>
#include <linux/unistd.h>
-#include <linux/raw.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <scsi/sg.h>
@@ -41,7 +40,6 @@
#define FIO_HAVE_IOSCHED_SWITCH
#define FIO_HAVE_ODIRECT
#define FIO_HAVE_HUGETLB
-#define FIO_HAVE_RAWBIND
#define FIO_HAVE_BLKTRACE
#define FIO_HAVE_CL_SIZE
#define FIO_HAVE_CGROUPS
@@ -58,7 +56,7 @@
#define OS_MAP_ANON MAP_ANONYMOUS
-#define FIO_EXT_ENG_DIR "/usr/lib/fio"
+#define FIO_EXT_ENG_DIR "/usr/local/lib/fio"
typedef cpu_set_t os_cpu_mask_t;
@@ -74,6 +72,12 @@ typedef cpu_set_t os_cpu_mask_t;
sched_getaffinity((pid), (ptr))
#endif
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
#define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask))
#define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask))
#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0)
@@ -172,36 +176,6 @@ static inline unsigned long long os_phys_mem(void)
return (unsigned long long) pages * (unsigned long long) pagesize;
}
-static inline int fio_lookup_raw(dev_t dev, int *majdev, int *mindev)
-{
- struct raw_config_request rq;
- int fd;
-
- if (major(dev) != RAW_MAJOR)
- return 1;
-
- /*
- * we should be able to find /dev/rawctl or /dev/raw/rawctl
- */
- fd = open("/dev/rawctl", O_RDONLY);
- if (fd < 0) {
- fd = open("/dev/raw/rawctl", O_RDONLY);
- if (fd < 0)
- return 1;
- }
-
- rq.raw_minor = minor(dev);
- if (ioctl(fd, RAW_GETBIND, &rq) < 0) {
- close(fd);
- return 1;
- }
-
- close(fd);
- *majdev = rq.block_major;
- *mindev = rq.block_minor;
- return 0;
-}
-
#ifdef O_NOATIME
#define FIO_O_NOATIME O_NOATIME
#else
diff --git a/os/os-mac.h b/os/os-mac.h
index 2852ac67..ec2cc1e5 100644
--- a/os/os-mac.h
+++ b/os/os-mac.h
@@ -27,11 +27,11 @@
#define fio_swap32(x) OSSwapInt32(x)
#define fio_swap64(x) OSSwapInt64(x)
-/*
- * OSX has a pitifully small shared memory segment by default,
- * so default to a lower number of max jobs supported
- */
-#define FIO_MAX_JOBS 128
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
#ifndef CONFIG_CLOCKID_T
typedef unsigned int clockid_t;
diff --git a/os/os-netbsd.h b/os/os-netbsd.h
index abc1d3cb..624c7fa5 100644
--- a/os/os-netbsd.h
+++ b/os/os-netbsd.h
@@ -35,6 +35,12 @@
#define fio_swap32(x) bswap32(x)
#define fio_swap64(x) bswap64(x)
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
{
struct disklabel dl;
diff --git a/os/os-openbsd.h b/os/os-openbsd.h
index 994bf078..f1bad671 100644
--- a/os/os-openbsd.h
+++ b/os/os-openbsd.h
@@ -35,6 +35,12 @@
#define fio_swap32(x) swap32(x)
#define fio_swap64(x) swap64(x)
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
{
struct disklabel dl;
diff --git a/os/os-solaris.h b/os/os-solaris.h
index f1966f44..ea1f081c 100644
--- a/os/os-solaris.h
+++ b/os/os-solaris.h
@@ -46,6 +46,12 @@ struct solaris_rand_seed {
#define os_ctime_r(x, y, z) ctime_r((x), (y), (z))
#define FIO_OS_HAS_CTIME_R
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
typedef psetid_t os_cpu_mask_t;
static inline int chardev_size(struct fio_file *f, unsigned long long *bytes)
diff --git a/os/os-windows-xp.h b/os/os-windows-xp.h
deleted file mode 100644
index fbc23e2c..00000000
--- a/os/os-windows-xp.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define FIO_MAX_CPUS MAXIMUM_PROCESSORS
-
-typedef DWORD_PTR os_cpu_mask_t;
diff --git a/os/os-windows.h b/os/os-windows.h
index fa2955f9..59da9dba 100644
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -21,6 +21,7 @@
#include "../lib/types.h"
#include "windows/posix.h"
+#include "os-windows-7.h"
#ifndef PTHREAD_STACK_MIN
#define PTHREAD_STACK_MIN 65535
@@ -76,6 +77,7 @@
#define SIGCONT 0
#define SIGUSR1 1
#define SIGUSR2 2
+#define SIGKILL 15 /* SIGKILL doesn't exists, let's use SIGTERM */
typedef int sigset_t;
typedef int siginfo_t;
@@ -215,13 +217,8 @@ static inline int fio_mkdir(const char *path, mode_t mode) {
return 0;
}
-#ifdef CONFIG_WINDOWS_XP
-#include "os-windows-xp.h"
-#else
#define FIO_HAVE_CPU_ONLINE_SYSCONF
unsigned int cpus_online(void);
-#include "os-windows-7.h"
-#endif
int first_set_cpu(os_cpu_mask_t *cpumask);
int fio_setaffinity(int pid, os_cpu_mask_t cpumask);
diff --git a/os/os.h b/os/os.h
index 9a280e54..17daf91d 100644
--- a/os/os.h
+++ b/os/os.h
@@ -7,6 +7,7 @@
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
+#include <errno.h>
#include "../arch/arch.h" /* IWYU pragma: export */
#include "../lib/types.h"
@@ -58,6 +59,10 @@ typedef enum {
#error "unsupported os"
#endif
+#ifndef EDQUOT
+#define EDQUOT EIO
+#endif
+
#ifdef CONFIG_POSIXAIO
#include <aio.h>
#ifndef FIO_OS_HAVE_AIOCB_TYPEDEF
@@ -152,10 +157,6 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
#define OS_RAND_MAX RAND_MAX
#endif
-#ifndef FIO_HAVE_RAWBIND
-#define fio_lookup_raw(dev, majdev, mindev) 1
-#endif
-
#ifndef FIO_PREFERRED_ENGINE
#define FIO_PREFERRED_ENGINE "psync"
#endif
@@ -172,10 +173,6 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
#endif
#endif
-#ifndef FIO_MAX_JOBS
-#define FIO_MAX_JOBS 4096
-#endif
-
#ifndef CONFIG_SOCKLEN_T
typedef unsigned int socklen_t;
#endif
diff --git a/os/windows/WixUI_Minimal_NoEULA.wxs b/os/windows/WixUI_Minimal_NoEULA.wxs
new file mode 100755
index 00000000..48391186
--- /dev/null
+++ b/os/windows/WixUI_Minimal_NoEULA.wxs
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Copyright (c) .NET Foundation and contributors. All rights reserved. Licensed under the Microsoft Reciprocal License. See LICENSE.TXT file in the project root for full license information. -->
+
+
+
+<!--
+First-time install dialog sequence:
+ - WixUI_MyWelcomeDlg
+Maintenance dialog sequence:
+ WixUI_MaintenanceWelcomeDlg
+ - WixUI_MaintenanceTypeDlg
+ - WixUI_VerifyReadyDlg
+-->
+
+<Wix xmlns="http://schemas.microsoft.com/wix/2006/wi">
+ <Fragment>
+ <UI Id="WixUI_Minimal_NoEULA">
+ <TextStyle Id="WixUI_Font_Normal" FaceName="Tahoma" Size="8" />
+ <TextStyle Id="WixUI_Font_Bigger" FaceName="Tahoma" Size="12" />
+ <TextStyle Id="WixUI_Font_Title" FaceName="Tahoma" Size="9" Bold="yes" />
+
+ <Property Id="DefaultUIFont" Value="WixUI_Font_Normal" />
+ <Property Id="WixUI_Mode" Value="Minimal" />
+
+ <DialogRef Id="ErrorDlg" />
+ <DialogRef Id="FatalError" />
+ <DialogRef Id="FilesInUse" />
+ <DialogRef Id="MsiRMFilesInUse" />
+ <DialogRef Id="PrepareDlg" />
+ <DialogRef Id="ProgressDlg" />
+ <DialogRef Id="ResumeDlg" />
+ <DialogRef Id="UserExit" />
+ <DialogRef Id="MyWelcomeDlg" />
+
+ <Dialog Id="MyWelcomeDlg" Width="370" Height="270" Title="!(loc.WelcomeDlg_Title)">
+ <Control Id="Install" Type="PushButton" ElevationShield="yes" X="236" Y="243" Width="56" Height="17" Default="yes" Hidden="yes" Text="!(loc.WelcomeEulaDlgInstall)" >
+ <Publish Property="WixUI_InstallMode" Value="Update">Installed AND PATCH</Publish>
+ <Publish Event="SpawnWaitDialog" Value="WaitForCostingDlg">!(wix.WixUICostingPopupOptOut) OR CostingComplete = 1</Publish>
+ <Publish Event="EndDialog" Value="Return"><![CDATA[OutOfDiskSpace <> 1]]></Publish>
+ <Publish Event="SpawnDialog" Value="OutOfRbDiskDlg">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND (PROMPTROLLBACKCOST="P" OR NOT PROMPTROLLBACKCOST)</Publish>
+ <Publish Event="EndDialog" Value="Return">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>
+ <Publish Event="EnableRollback" Value="False">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>
+ <Publish Event="SpawnDialog" Value="OutOfDiskDlg">(OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 1) OR (OutOfDiskSpace = 1 AND PROMPTROLLBACKCOST="F")</Publish>
+ <Condition Action="show">ALLUSERS</Condition>
+ </Control>
+ <Control Id="InstallNoShield" Type="PushButton" ElevationShield="no" X="212" Y="243" Width="80" Height="17" Default="yes" Text="!(loc.WelcomeEulaDlgInstall)" Hidden="yes">
+ <Publish Event="SpawnWaitDialog" Value="WaitForCostingDlg">!(wix.WixUICostingPopupOptOut) OR CostingComplete = 1</Publish>
+ <Publish Event="EndDialog" Value="Return"><![CDATA[OutOfDiskSpace <> 1]]></Publish>
+ <Publish Event="SpawnDialog" Value="OutOfRbDiskDlg">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND (PROMPTROLLBACKCOST="P" OR NOT PROMPTROLLBACKCOST)</Publish>
+ <Publish Event="EndDialog" Value="Return">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>
+ <Publish Event="EnableRollback" Value="False">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>
+ <Publish Event="SpawnDialog" Value="OutOfDiskDlg">(OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 1) OR (OutOfDiskSpace = 1 AND PROMPTROLLBACKCOST="F")</Publish>
+ <Condition Action="disable"><![CDATA[LicenseAccepted <> "1"]]></Condition>
+ <Condition Action="show">NOT ALLUSERS</Condition>
+ </Control>
+ <Control Id="Cancel" Type="PushButton" X="304" Y="243" Width="56" Height="17" Cancel="yes" Text="!(loc.WixUICancel)">
+ <Publish Event="SpawnDialog" Value="CancelDlg">1</Publish>
+ </Control>
+ <Control Id="Bitmap" Type="Bitmap" X="0" Y="0" Width="370" Height="234" TabSkip="no" Text="!(loc.WelcomeDlgBitmap)" />
+ <Control Id="Back" Type="PushButton" X="180" Y="243" Width="56" Height="17" Disabled="yes" Text="!(loc.WixUIBack)" />
+ <Control Id="BottomLine" Type="Line" X="0" Y="234" Width="370" Height="0" />
+ <Control Id="Description" Type="Text" X="135" Y="80" Width="220" Height="60" Transparent="yes" NoPrefix="yes" Text="!(loc.MyWelcomeDlgDescription)" >
+ <Condition Action="show">NOT Installed OR NOT PATCH</Condition>
+ <Condition Action="hide">Installed AND PATCH</Condition>
+ </Control>
+ <Control Id="PatchDescription" Type="Text" X="135" Y="80" Width="220" Height="60" Transparent="yes" NoPrefix="yes" Text="!(loc.WelcomeUpdateDlgDescriptionUpdate)" >
+ <Condition Action="show">Installed AND PATCH</Condition>
+ <Condition Action="hide">NOT Installed OR NOT PATCH</Condition>
+ </Control>
+ <Control Id="Title" Type="Text" X="135" Y="20" Width="220" Height="60" Transparent="yes" NoPrefix="yes" Text="!(loc.WelcomeDlgTitle)" />
+ </Dialog>
+
+ <Publish Dialog="ExitDialog" Control="Finish" Event="EndDialog" Value="Return" Order="999">1</Publish>
+
+ <Publish Dialog="VerifyReadyDlg" Control="Back" Event="NewDialog" Value="MaintenanceTypeDlg">1</Publish>
+
+ <Publish Dialog="MaintenanceWelcomeDlg" Control="Next" Event="NewDialog" Value="MaintenanceTypeDlg">1</Publish>
+
+ <Publish Dialog="MaintenanceTypeDlg" Control="RepairButton" Event="NewDialog" Value="VerifyReadyDlg">1</Publish>
+ <Publish Dialog="MaintenanceTypeDlg" Control="RemoveButton" Event="NewDialog" Value="VerifyReadyDlg">1</Publish>
+ <Publish Dialog="MaintenanceTypeDlg" Control="Back" Event="NewDialog" Value="MaintenanceWelcomeDlg">1</Publish>
+
+ <Publish Dialog="MyWelcomeDlg" Control="Install" Event="NewDialog" Value="PrepareDlg">1</Publish>
+ <Publish Dialog="VerifyReadyDlg" Control="Back" Event="NewDialog" Value="WelcomeDlg" Order="2">Installed AND PATCH</Publish>
+
+ <InstallUISequence>
+ <Show Dialog="WelcomeDlg" Before="ProgressDlg">0</Show>
+ <Show Dialog="MyWelcomeDlg" Before="ProgressDlg">NOT Installed</Show>
+ </InstallUISequence>
+
+ <Property Id="ARPNOMODIFY" Value="1" />
+ </UI>
+
+ <UIRef Id="WixUI_Common" />
+ </Fragment>
+</Wix> \ No newline at end of file
diff --git a/os/windows/WixUI_fio.wxl b/os/windows/WixUI_fio.wxl
new file mode 100755
index 00000000..11ec736a
--- /dev/null
+++ b/os/windows/WixUI_fio.wxl
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (c) .NET Foundation and contributors. All rights reserved. Licensed under the Microsoft Reciprocal License. See LICENSE.TXT file in the project root for full license information. -->
+
+
+<WixLocalization Culture="en-US" Codepage="1252" xmlns="http://schemas.microsoft.com/wix/2006/localization">
+ <!-- _locID@Culture="en-US" _locComment="American English" -->
+ <!-- _locID@Codepage="1252" _locComment="Windows-1252" -->
+
+<String Id="MyWelcomeDlgDescription" Overridable="yes">
+<!-- _locID_text="MyWelcomeDlgDescription" _locComment="MyWelcomeDlgDescription" -->The Setup Wizard will install [ProductName] on your computer. Click Install to continue or Cancel to exit the Setup Wizard.
+</String>
+</WixLocalization> \ No newline at end of file
diff --git a/os/windows/cpu-affinity.c b/os/windows/cpu-affinity.c
index 69997b24..7601970f 100644
--- a/os/windows/cpu-affinity.c
+++ b/os/windows/cpu-affinity.c
@@ -2,78 +2,6 @@
#include <windows.h>
-#ifdef CONFIG_WINDOWS_XP
-int fio_setaffinity(int pid, os_cpu_mask_t cpumask)
-{
- HANDLE h;
- BOOL bSuccess = FALSE;
-
- h = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, TRUE,
- pid);
- if (h != NULL) {
- bSuccess = SetThreadAffinityMask(h, cpumask);
- if (!bSuccess)
- log_err("fio_setaffinity failed: failed to set thread affinity (pid %d, mask %.16llx)\n",
- pid, cpumask);
-
- CloseHandle(h);
- } else {
- log_err("fio_setaffinity failed: failed to get handle for pid %d\n",
- pid);
- }
-
- return bSuccess ? 0 : -1;
-}
-
-int fio_getaffinity(int pid, os_cpu_mask_t *mask)
-{
- os_cpu_mask_t systemMask;
-
- HANDLE h = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid);
-
- if (h != NULL) {
- GetProcessAffinityMask(h, mask, &systemMask);
- CloseHandle(h);
- } else {
- log_err("fio_getaffinity failed: failed to get handle for pid %d\n",
- pid);
- return -1;
- }
-
- return 0;
-}
-
-void fio_cpu_clear(os_cpu_mask_t *mask, int cpu)
-{
- *mask &= ~(1ULL << cpu);
-}
-
-void fio_cpu_set(os_cpu_mask_t *mask, int cpu)
-{
- *mask |= 1ULL << cpu;
-}
-
-int fio_cpu_isset(os_cpu_mask_t *mask, int cpu)
-{
- return (*mask & (1ULL << cpu)) != 0;
-}
-
-int fio_cpu_count(os_cpu_mask_t *mask)
-{
- return hweight64(*mask);
-}
-
-int fio_cpuset_init(os_cpu_mask_t *mask)
-{
- *mask = 0;
- return 0;
-}
-
-int fio_cpuset_exit(os_cpu_mask_t *mask)
-{
- return 0;
-}
-#else /* CONFIG_WINDOWS_XP */
/* Return all processors regardless of processor group */
unsigned int cpus_online(void)
{
@@ -83,7 +11,7 @@ unsigned int cpus_online(void)
static void print_mask(os_cpu_mask_t *cpumask)
{
for (int i = 0; i < FIO_CPU_MASK_ROWS; i++)
- dprint(FD_PROCESS, "cpumask[%d]=%lu\n", i, cpumask->row[i]);
+ dprint(FD_PROCESS, "cpumask[%d]=%" PRIu64 "\n", i, cpumask->row[i]);
}
/* Return the index of the least significant set CPU in cpumask or -1 if no
@@ -99,7 +27,7 @@ int first_set_cpu(os_cpu_mask_t *cpumask)
int row_first_cpu;
row_first_cpu = __builtin_ffsll(cpumask->row[row]) - 1;
- dprint(FD_PROCESS, "row_first_cpu=%d cpumask->row[%d]=%lu\n",
+ dprint(FD_PROCESS, "row_first_cpu=%d cpumask->row[%d]=%" PRIu64 "\n",
row_first_cpu, row, cpumask->row[row]);
if (row_first_cpu > -1) {
mask_first_cpu = cpus_offset + row_first_cpu;
@@ -136,7 +64,7 @@ static int last_set_cpu(os_cpu_mask_t *cpumask)
row_last_cpu++;
}
- dprint(FD_PROCESS, "row_last_cpu=%d cpumask->row[%d]=%lu\n",
+ dprint(FD_PROCESS, "row_last_cpu=%d cpumask->row[%d]=%" PRIu64 "\n",
row_last_cpu, row, cpumask->row[row]);
if (row_last_cpu > -1) {
mask_last_cpu = cpus_offset + row_last_cpu;
@@ -213,13 +141,17 @@ static int mask_to_group_mask(os_cpu_mask_t *cpumask, int *processor_group, uint
needed_shift = FIO_CPU_MASK_STRIDE - bit_offset;
needed_mask_shift = FIO_CPU_MASK_STRIDE - needed;
needed_mask = (uint64_t)-1 >> needed_mask_shift;
- dprint(FD_PROCESS, "bit_offset=%d end=%d needed=%d needed_shift=%d needed_mask=%ld needed_mask_shift=%d\n", bit_offset, end, needed, needed_shift, needed_mask, needed_mask_shift);
+ dprint(FD_PROCESS,
+ "bit_offset=%d end=%d needed=%d needed_shift=%d needed_mask=%" PRIu64 "needed_mask_shift=%d\n",
+ bit_offset, end, needed, needed_shift, needed_mask,
+ needed_mask_shift);
group_cpumask |= (cpumask->row[row + 1] & needed_mask) << needed_shift;
}
group_cpumask &= (uint64_t)-1 >> (FIO_CPU_MASK_STRIDE - group_size);
/* Return group and mask */
- dprint(FD_PROCESS, "Returning group=%d group_mask=%lu\n", group, group_cpumask);
+ dprint(FD_PROCESS, "Returning group=%d group_mask=%" PRIu64 "\n",
+ group, group_cpumask);
*processor_group = group;
*affinity_mask = group_cpumask;
@@ -257,10 +189,8 @@ int fio_setaffinity(int pid, os_cpu_mask_t cpumask)
if (SetThreadGroupAffinity(handle, &new_group_affinity, NULL) != 0)
ret = 0;
else {
- log_err("fio_setaffinity: failed to set thread affinity "
- "(pid %d, group %d, mask %" PRIx64 ", "
- "GetLastError=%d)\n", pid, group, group_mask,
- GetLastError());
+ log_err("fio_setaffinity: failed to set thread affinity (pid %d, group %d, mask %" PRIx64 ", GetLastError=%lu)\n",
+ pid, group, group_mask, GetLastError());
goto err;
}
@@ -319,7 +249,7 @@ int fio_getaffinity(int pid, os_cpu_mask_t *mask)
goto err;
}
if (!GetProcessGroupAffinity(handle, &group_count, current_groups)) {
- log_err("%s: failed to get single group affinity for pid %d (%d)\n",
+ log_err("%s: failed to get single group affinity for pid %d (%lu)\n",
__func__, pid, GetLastError());
goto err;
}
@@ -329,7 +259,7 @@ int fio_getaffinity(int pid, os_cpu_mask_t *mask)
goto err;
}
if (!GetProcessAffinityMask(handle, &process_mask, &system_mask)) {
- log_err("%s: GetProcessAffinityMask() failed for pid\n",
+ log_err("%s: GetProcessAffinityMask() failed for pid %d\n",
__func__, pid);
goto err;
}
@@ -441,4 +371,3 @@ int fio_cpuset_exit(os_cpu_mask_t *mask)
{
return 0;
}
-#endif /* CONFIG_WINDOWS_XP */
diff --git a/os/windows/dobuild.cmd b/os/windows/dobuild.cmd
index d06a2afa..7b9cb1dd 100644
--- a/os/windows/dobuild.cmd
+++ b/os/windows/dobuild.cmd
@@ -34,13 +34,22 @@ if defined SIGN_FIO (
signtool sign /as /n "%SIGNING_CN%" /tr http://timestamp.digicert.com /td sha256 /fd sha256 ..\..\t\*.exe
)
-"%WIX%bin\candle" -nologo -arch %FIO_ARCH% -dFioVersionNumbers="%FIO_VERSION_NUMBERS%" install.wxs
+if exist ..\..\fio.pdb (
+ set FIO_PDB=true
+) else (
+ set FIO_PDB=false
+)
+
+"%WIX%bin\candle" -nologo -arch %FIO_ARCH% -dFioVersionNumbers="%FIO_VERSION_NUMBERS%" -dFioPDB="%FIO_PDB%" install.wxs
@if ERRORLEVEL 1 goto end
"%WIX%bin\candle" -nologo -arch %FIO_ARCH% examples.wxs
@if ERRORLEVEL 1 goto end
-"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi
+"%WIX%bin\candle" -nologo -arch %FIO_ARCH% WixUI_Minimal_NoEULA.wxs
+@if ERRORLEVEL 1 goto end
+
+"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj WixUI_Minimal_NoEULA.wixobj -loc WixUI_fio.wxl -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi
:end
if defined SIGN_FIO (
signtool sign /n "%SIGNING_CN%" /tr http://timestamp.digicert.com /td sha256 /fd sha256 %FIO_VERSION%-%FIO_ARCH%.msi
-) \ No newline at end of file
+)
diff --git a/os/windows/eula.rtf b/os/windows/eula.rtf
deleted file mode 100755
index a931017c..00000000
--- a/os/windows/eula.rtf
+++ /dev/null
Binary files differ
diff --git a/os/windows/install.wxs b/os/windows/install.wxs
index dcb8c92c..7773bb3b 100755
--- a/os/windows/install.wxs
+++ b/os/windows/install.wxs
@@ -27,6 +27,11 @@
<File Source="..\..\fio.exe"/>
<Environment Action="set" Part="last" Id="PATH" Name="PATH" Value="[INSTALLDIR]fio\" System="yes"/>
</Component>
+ <?if $(var.FioPDB) = true?>
+ <Component>
+ <File Id="fio.pdb" Name="fio.pdb" Source="..\..\fio.pdb"/>
+ </Component>
+ <?endif?>
<Component>
<File Id="README" Name="README.txt" Source="..\..\README"/>
</Component>
@@ -76,6 +81,9 @@
<Feature Id="AlwaysInstall" Absent="disallow" ConfigurableDirectory="INSTALLDIR" Display="hidden" Level="1" Title="Flexible I/O Tester">
<ComponentRef Id="fio.exe"/>
+ <?if $(var.FioPDB) = true?>
+ <ComponentRef Id="fio.pdb"/>
+ <?endif?>
<ComponentRef Id="HOWTO"/>
<ComponentRef Id="README"/>
<ComponentRef Id="REPORTING_BUGS"/>
@@ -99,7 +107,7 @@
<WixVariable Id="WixUILicenseRtf" Value="eula.rtf" />
- <UIRef Id="WixUI_Minimal"/>
+ <UIRef Id="WixUI_Minimal_NoEULA"/>
<MajorUpgrade AllowDowngrades="no" DowngradeErrorMessage="A newer version of the application is already installed."
AllowSameVersionUpgrades="yes"/>
diff --git a/os/windows/posix.c b/os/windows/posix.c
index 31271de0..09c2e4a7 100644
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -168,7 +168,7 @@ int win_to_posix_error(DWORD winerr)
case ERROR_FILE_INVALID:
return ENXIO;
default:
- log_err("fio: windows error %d not handled\n", winerr);
+ log_err("fio: windows error %lu not handled\n", winerr);
return EIO;
}
@@ -188,7 +188,8 @@ int GetNumLogicalProcessors(void)
if (error == ERROR_INSUFFICIENT_BUFFER)
processor_info = malloc(len);
else {
- log_err("Error: GetLogicalProcessorInformation failed: %d\n", error);
+ log_err("Error: GetLogicalProcessorInformation failed: %lu\n",
+ error);
return -1;
}
@@ -1025,90 +1026,3 @@ in_addr_t inet_network(const char *cp)
hbo = ((nbo & 0xFF) << 24) + ((nbo & 0xFF00) << 8) + ((nbo & 0xFF0000) >> 8) + ((nbo & 0xFF000000) >> 24);
return hbo;
}
-
-#ifdef CONFIG_WINDOWS_XP
-const char *inet_ntop(int af, const void *restrict src, char *restrict dst,
- socklen_t size)
-{
- INT status = SOCKET_ERROR;
- WSADATA wsd;
- char *ret = NULL;
-
- if (af != AF_INET && af != AF_INET6) {
- errno = EAFNOSUPPORT;
- return NULL;
- }
-
- WSAStartup(MAKEWORD(2,2), &wsd);
-
- if (af == AF_INET) {
- struct sockaddr_in si;
- DWORD len = size;
-
- memset(&si, 0, sizeof(si));
- si.sin_family = af;
- memcpy(&si.sin_addr, src, sizeof(si.sin_addr));
- status = WSAAddressToString((struct sockaddr*)&si, sizeof(si), NULL, dst, &len);
- } else if (af == AF_INET6) {
- struct sockaddr_in6 si6;
- DWORD len = size;
-
- memset(&si6, 0, sizeof(si6));
- si6.sin6_family = af;
- memcpy(&si6.sin6_addr, src, sizeof(si6.sin6_addr));
- status = WSAAddressToString((struct sockaddr*)&si6, sizeof(si6), NULL, dst, &len);
- }
-
- if (status != SOCKET_ERROR)
- ret = dst;
- else
- errno = ENOSPC;
-
- WSACleanup();
-
- return ret;
-}
-
-int inet_pton(int af, const char *restrict src, void *restrict dst)
-{
- INT status = SOCKET_ERROR;
- WSADATA wsd;
- int ret = 1;
-
- if (af != AF_INET && af != AF_INET6) {
- errno = EAFNOSUPPORT;
- return -1;
- }
-
- WSAStartup(MAKEWORD(2,2), &wsd);
-
- if (af == AF_INET) {
- struct sockaddr_in si;
- INT len = sizeof(si);
-
- memset(&si, 0, sizeof(si));
- si.sin_family = af;
- status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si, &len);
- if (status != SOCKET_ERROR)
- memcpy(dst, &si.sin_addr, sizeof(si.sin_addr));
- } else if (af == AF_INET6) {
- struct sockaddr_in6 si6;
- INT len = sizeof(si6);
-
- memset(&si6, 0, sizeof(si6));
- si6.sin6_family = af;
- status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si6, &len);
- if (status != SOCKET_ERROR)
- memcpy(dst, &si6.sin6_addr, sizeof(si6.sin6_addr));
- }
-
- if (status == SOCKET_ERROR) {
- errno = ENOSPC;
- ret = 0;
- }
-
- WSACleanup();
-
- return ret;
-}
-#endif /* CONFIG_WINDOWS_XP */
diff --git a/os/windows/posix/include/arpa/inet.h b/os/windows/posix/include/arpa/inet.h
index 056f1dd5..1024db37 100644
--- a/os/windows/posix/include/arpa/inet.h
+++ b/os/windows/posix/include/arpa/inet.h
@@ -12,10 +12,4 @@ typedef int in_addr_t;
in_addr_t inet_network(const char *cp);
-#ifdef CONFIG_WINDOWS_XP
-const char *inet_ntop(int af, const void *restrict src,
- char *restrict dst, socklen_t size);
-int inet_pton(int af, const char *restrict src, void *restrict dst);
-#endif
-
#endif /* ARPA_INET_H */
diff --git a/os/windows/posix/include/poll.h b/os/windows/posix/include/poll.h
index 25b8183f..5099cf2e 100644
--- a/os/windows/posix/include/poll.h
+++ b/os/windows/posix/include/poll.h
@@ -5,20 +5,6 @@
typedef int nfds_t;
-#ifdef CONFIG_WINDOWS_XP
-struct pollfd
-{
- int fd;
- short events;
- short revents;
-};
-
-#define POLLOUT 1
-#define POLLIN 2
-#define POLLERR 0
-#define POLLHUP 1
-#endif /* CONFIG_WINDOWS_XP */
-
int poll(struct pollfd fds[], nfds_t nfds, int timeout);
#endif /* POLL_H */
diff --git a/oslib/blkzoned.h b/oslib/blkzoned.h
index 4cc071dc..719b041d 100644
--- a/oslib/blkzoned.h
+++ b/oslib/blkzoned.h
@@ -16,6 +16,8 @@ extern int blkzoned_report_zones(struct thread_data *td,
struct zbd_zone *zones, unsigned int nr_zones);
extern int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f,
uint64_t offset, uint64_t length);
+extern int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones);
#else
/*
* Define stubs for systems that do not have zoned block device support.
@@ -44,6 +46,11 @@ static inline int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f,
{
return -EIO;
}
+static inline int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones)
+{
+ return -EIO;
+}
#endif
#endif /* FIO_BLKZONED_H */
diff --git a/oslib/getopt_long.c b/oslib/getopt_long.c
index 8ec77413..463919fb 100644
--- a/oslib/getopt_long.c
+++ b/oslib/getopt_long.c
@@ -16,8 +16,8 @@
#include "getopt.h"
-char *optarg = NULL;
-int optind = 0, opterr = 0, optopt = 0;
+char *optarg;
+int optind, opterr, optopt;
static struct getopt_private_state {
const char *optptr;
diff --git a/oslib/libmtd.c b/oslib/libmtd.c
index 385b9d2f..5fca3a01 100644
--- a/oslib/libmtd.c
+++ b/oslib/libmtd.c
@@ -35,6 +35,8 @@
#include <sys/ioctl.h>
#include <inttypes.h>
+#include "../compiler/compiler.h"
+
#include <mtd/mtd-user.h>
#include "libmtd.h"
@@ -960,7 +962,7 @@ int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb)
void *buf;
normsg("run torture test for PEB %d", eb);
- patt_count = ARRAY_SIZE(patterns);
+ patt_count = FIO_ARRAY_SIZE(patterns);
buf = xmalloc(mtd->eb_size);
diff --git a/oslib/libmtd_common.h b/oslib/libmtd_common.h
index 4ed9f0ba..db0494dd 100644
--- a/oslib/libmtd_common.h
+++ b/oslib/libmtd_common.h
@@ -47,7 +47,6 @@ extern "C" {
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
#define min(a, b) MIN(a, b) /* glue for linux kernel source */
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c
index 0a8a577a..4e441d29 100644
--- a/oslib/linux-blkzoned.c
+++ b/oslib/linux-blkzoned.c
@@ -24,6 +24,37 @@
#include <linux/blkzoned.h>
/*
+ * If the uapi headers installed on the system lacks zone capacity support,
+ * use our local versions. If the installed headers are recent enough to
+ * support zone capacity, do not redefine any structs.
+ */
+#ifndef CONFIG_HAVE_REP_CAPACITY
+#define BLK_ZONE_REP_CAPACITY (1 << 0)
+
+struct blk_zone_v2 {
+ __u64 start; /* Zone start sector */
+ __u64 len; /* Zone length in number of sectors */
+ __u64 wp; /* Zone write pointer position */
+ __u8 type; /* Zone type */
+ __u8 cond; /* Zone condition */
+ __u8 non_seq; /* Non-sequential write resources active */
+ __u8 reset; /* Reset write pointer recommended */
+ __u8 resv[4];
+ __u64 capacity; /* Zone capacity in number of sectors */
+ __u8 reserved[24];
+};
+#define blk_zone blk_zone_v2
+
+struct blk_zone_report_v2 {
+ __u64 sector;
+ __u32 nr_zones;
+ __u32 flags;
+struct blk_zone zones[0];
+};
+#define blk_zone_report blk_zone_report_v2
+#endif /* CONFIG_HAVE_REP_CAPACITY */
+
+/*
* Read up to 255 characters from the first line of a file. Strip the trailing
* newline.
*/
@@ -43,12 +74,16 @@ static char *read_file(const char *path)
return strdup(line);
}
-int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
- enum zbd_zoned_model *model)
+/*
+ * Get the value of a sysfs attribute for a block device.
+ *
+ * Returns NULL on failure.
+ * Returns a pointer to a string on success.
+ * The caller is responsible for freeing the memory.
+ */
+static char *blkzoned_get_sysfs_attr(const char *file_name, const char *attr)
{
- const char *file_name = f->file_name;
- char *zoned_attr_path = NULL;
- char *model_str = NULL;
+ char *attr_path = NULL;
struct stat statbuf;
char *sys_devno_path = NULL;
char *part_attr_path = NULL;
@@ -56,13 +91,7 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
char sys_path[PATH_MAX];
ssize_t sz;
char *delim = NULL;
-
- if (f->filetype != FIO_TYPE_BLOCK) {
- *model = ZBD_IGNORE;
- return 0;
- }
-
- *model = ZBD_NONE;
+ char *attr_str = NULL;
if (stat(file_name, &statbuf) < 0)
goto out;
@@ -92,34 +121,71 @@ int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
*delim = '\0';
}
- if (asprintf(&zoned_attr_path,
- "/sys/dev/block/%s/queue/zoned", sys_path) < 0)
+ if (asprintf(&attr_path,
+ "/sys/dev/block/%s/%s", sys_path, attr) < 0)
goto out;
- model_str = read_file(zoned_attr_path);
+ attr_str = read_file(attr_path);
+out:
+ free(attr_path);
+ free(part_str);
+ free(part_attr_path);
+ free(sys_devno_path);
+
+ return attr_str;
+}
+
+int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
+ enum zbd_zoned_model *model)
+{
+ char *model_str = NULL;
+
+ if (f->filetype != FIO_TYPE_BLOCK)
+ return -EINVAL;
+
+ *model = ZBD_NONE;
+
+ model_str = blkzoned_get_sysfs_attr(f->file_name, "queue/zoned");
if (!model_str)
- goto out;
- dprint(FD_ZBD, "%s: zbd model string: %s\n", file_name, model_str);
+ return 0;
+
+ dprint(FD_ZBD, "%s: zbd model string: %s\n", f->file_name, model_str);
if (strcmp(model_str, "host-aware") == 0)
*model = ZBD_HOST_AWARE;
else if (strcmp(model_str, "host-managed") == 0)
*model = ZBD_HOST_MANAGED;
-out:
+
free(model_str);
- free(zoned_attr_path);
- free(part_str);
- free(part_attr_path);
- free(sys_devno_path);
+
+ return 0;
+}
+
+int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones)
+{
+ char *max_open_str;
+
+ if (f->filetype != FIO_TYPE_BLOCK)
+ return -EIO;
+
+ max_open_str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones");
+ if (!max_open_str)
+ return 0;
+
+ dprint(FD_ZBD, "%s: max open zones supported by device: %s\n",
+ f->file_name, max_open_str);
+ *max_open_zones = atoll(max_open_str);
+
+ free(max_open_str);
+
return 0;
}
static uint64_t zone_capacity(struct blk_zone_report *hdr,
struct blk_zone *blkz)
{
-#ifdef CONFIG_HAVE_REP_CAPACITY
if (hdr->flags & BLK_ZONE_REP_CAPACITY)
return blkz->capacity << 9;
-#endif
return blkz->len << 9;
}
@@ -203,7 +269,7 @@ int blkzoned_report_zones(struct thread_data *td, struct fio_file *f,
default:
/* Treat all these conditions as offline (don't use!) */
z->cond = ZBD_ZONE_COND_OFFLINE;
- break;
+ z->wp = z->start;
}
}
diff --git a/parse.c b/parse.c
index f4cefcf6..45f4f2d3 100644
--- a/parse.c
+++ b/parse.c
@@ -37,6 +37,7 @@ static const char *opt_type_names[] = {
"OPT_BOOL",
"OPT_FLOAT_LIST",
"OPT_STR_SET",
+ "OPT_STR_VAL_ZONE",
"OPT_DEPRECATED",
"OPT_SOFT_DEPRECATED",
"OPT_UNSUPPORTED",
@@ -501,7 +502,7 @@ static int str_match_len(const struct value_pair *vp, const char *str)
static const char *opt_type_name(const struct fio_option *o)
{
- compiletime_assert(ARRAY_SIZE(opt_type_names) - 1 == FIO_OPT_UNSUPPORTED,
+ compiletime_assert(FIO_ARRAY_SIZE(opt_type_names) - 1 == FIO_OPT_UNSUPPORTED,
"opt_type_names[] index");
if (o->type <= FIO_OPT_UNSUPPORTED)
@@ -599,9 +600,35 @@ static int __handle_option(const struct fio_option *o, const char *ptr,
fallthrough;
case FIO_OPT_ULL:
case FIO_OPT_INT:
- case FIO_OPT_STR_VAL: {
+ case FIO_OPT_STR_VAL:
+ case FIO_OPT_STR_VAL_ZONE:
+ {
fio_opt_str_val_fn *fn = o->cb;
char tmp[128], *p;
+ size_t len = strlen(ptr);
+
+ if (len > 0 && ptr[len - 1] == 'z') {
+ if (o->type == FIO_OPT_STR_VAL_ZONE) {
+ char *ep;
+ unsigned long long val;
+
+ errno = 0;
+ val = strtoul(ptr, &ep, 10);
+ if (errno == 0 && ep != ptr && *ep == 'z') {
+ ull = ZONE_BASE_VAL + (uint32_t)val;
+ ret = 0;
+ goto store_option_value;
+ } else {
+ log_err("%s: unexpected zone value '%s'\n",
+ o->name, ptr);
+ return 1;
+ }
+ } else {
+ log_err("%s: 'z' suffix isn't applicable\n",
+ o->name);
+ return 1;
+ }
+ }
if (!is_time && o->is_time)
is_time = o->is_time;
@@ -655,6 +682,7 @@ static int __handle_option(const struct fio_option *o, const char *ptr,
}
}
+store_option_value:
if (fn)
ret = fn(data, &ull);
else {
@@ -786,6 +814,11 @@ static int __handle_option(const struct fio_option *o, const char *ptr,
if (o->off1) {
cp = td_var(data, o, o->off1);
*cp = strdup(ptr);
+ if (strlen(ptr) > o->maxlen - 1) {
+ log_err("value exceeds max length of %d\n",
+ o->maxlen);
+ return 1;
+ }
}
if (fn)
diff --git a/parse.h b/parse.h
index 1d2cbf74..d68484ea 100644
--- a/parse.h
+++ b/parse.h
@@ -21,6 +21,7 @@ enum fio_opt_type {
FIO_OPT_BOOL,
FIO_OPT_FLOAT_LIST,
FIO_OPT_STR_SET,
+ FIO_OPT_STR_VAL_ZONE,
FIO_OPT_DEPRECATED,
FIO_OPT_SOFT_DEPRECATED,
FIO_OPT_UNSUPPORTED, /* keep this last */
@@ -125,17 +126,23 @@ static inline void *td_var(void *to, const struct fio_option *o,
else
ret = to;
- return ret + offset;
+ return (void *) ((uintptr_t) ret + offset);
}
static inline int parse_is_percent(unsigned long long val)
{
- return val <= -1ULL && val >= (-1ULL - 100ULL);
+ return val >= -101ULL;
}
+#define ZONE_BASE_VAL ((-1ULL >> 1) + 1)
static inline int parse_is_percent_uncapped(unsigned long long val)
{
- return (long long)val <= -1;
+ return ZONE_BASE_VAL + -1U < val;
+}
+
+static inline int parse_is_zone(unsigned long long val)
+{
+ return (val - ZONE_BASE_VAL) <= -1U;
}
struct print_option {
diff --git a/server.c b/server.c
index 248a2d44..42eaa4b1 100644
--- a/server.c
+++ b/server.c
@@ -409,8 +409,9 @@ struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait)
if (cmdret->opcode == FIO_NET_CMD_TEXT) {
struct cmd_text_pdu *__pdu = (struct cmd_text_pdu *) cmdret->payload;
char *buf = (char *) __pdu->buf;
+ int len = le32_to_cpu(__pdu->buf_len);
- buf[__pdu->buf_len] = '\0';
+ buf[len] = '\0';
} else if (cmdret->opcode == FIO_NET_CMD_JOB) {
struct cmd_job_pdu *__pdu = (struct cmd_job_pdu *) cmdret->payload;
char *buf = (char *) __pdu->buf;
@@ -950,7 +951,7 @@ static int handle_update_job_cmd(struct fio_net_cmd *cmd)
return 0;
}
- td = &threads[tnumber - 1];
+ td = tnumber_to_td(tnumber);
convert_thread_options_to_cpu(&td->o, &pdu->top);
send_update_job_reply(cmd->tag, 0);
return 0;
@@ -1909,7 +1910,7 @@ static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
break;
}
flist_add_tail(&entry->list, &first->next);
- } while (ret != Z_STREAM_END);
+ }
ret = deflateEnd(&stream);
if (ret == Z_OK)
diff --git a/server.h b/server.h
index 6d444749..daed057a 100644
--- a/server.h
+++ b/server.h
@@ -48,7 +48,7 @@ struct fio_net_cmd_reply {
};
enum {
- FIO_SERVER_VER = 86,
+ FIO_SERVER_VER = 92,
FIO_SERVER_MAX_FRAGMENT_PDU = 1024,
FIO_SERVER_MAX_CMD_MB = 2048,
diff --git a/stat.c b/stat.c
index 7f987c7f..a8a96c85 100644
--- a/stat.c
+++ b/stat.c
@@ -282,6 +282,46 @@ bool calc_lat(struct io_stat *is, unsigned long long *min,
return true;
}
+void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out)
+{
+ char *io, *agg, *min, *max;
+ char *ioalt, *aggalt, *minalt, *maxalt;
+ uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0, min_run = -1, max_run = 0;
+ int i;
+ const int i2p = is_power_of_2(rs->kb_base);
+
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ if (!rs->max_run[i])
+ continue;
+ io_mix += rs->iobytes[i];
+ agg_mix += rs->agg[i];
+ min_mix = min_mix < rs->min_bw[i] ? min_mix : rs->min_bw[i];
+ max_mix = max_mix > rs->max_bw[i] ? max_mix : rs->max_bw[i];
+ min_run = min_run < rs->min_run[i] ? min_run : rs->min_run[i];
+ max_run = max_run > rs->max_run[i] ? max_run : rs->max_run[i];
+ }
+ io = num2str(io_mix, rs->sig_figs, 1, i2p, N2S_BYTE);
+ ioalt = num2str(io_mix, rs->sig_figs, 1, !i2p, N2S_BYTE);
+ agg = num2str(agg_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+ aggalt = num2str(agg_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+ min = num2str(min_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+ minalt = num2str(min_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+ max = num2str(max_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+ maxalt = num2str(max_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+ log_buf(out, " MIXED: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
+ agg, aggalt, min, max, minalt, maxalt, io, ioalt,
+ (unsigned long long) min_run,
+ (unsigned long long) max_run);
+ free(io);
+ free(agg);
+ free(min);
+ free(max);
+ free(ioalt);
+ free(aggalt);
+ free(minalt);
+ free(maxalt);
+}
+
void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
{
char *io, *agg, *min, *max;
@@ -306,7 +346,7 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
max = num2str(rs->max_bw[i], rs->sig_figs, 1, i2p, rs->unit_base);
maxalt = num2str(rs->max_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base);
log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
- rs->unified_rw_rep ? " MIXED" : str[i],
+ (rs->unified_rw_rep == UNIFIED_MIXED) ? " MIXED" : str[i],
agg, aggalt, min, max, minalt, maxalt, io, ioalt,
(unsigned long long) rs->min_run[i],
(unsigned long long) rs->max_run[i]);
@@ -320,6 +360,10 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
free(minalt);
free(maxalt);
}
+
+	/* Need to aggregate statistics to show mixed values */
+ if (rs->unified_rw_rep == UNIFIED_BOTH)
+ show_mixed_group_stats(rs, out);
}
void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist)
@@ -418,7 +462,7 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i
{
double p_of_agg = 100.0;
if (rs && rs->agg[ddir] > 1024) {
- p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024.0);
+ p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0);
if (p_of_agg > 100.0)
p_of_agg = 100.0;
@@ -426,6 +470,168 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i
return p_of_agg;
}
+static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
+ struct buf_output *out)
+{
+ unsigned long runt;
+ unsigned long long min, max, bw, iops;
+ double mean, dev;
+ char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
+ struct thread_stat *ts_lcl;
+
+ int i2p;
+ int ddir = 0, i;
+
+ /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+ ts_lcl = malloc(sizeof(struct thread_stat));
+ memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
+ ts_lcl->unified_rw_rep = UNIFIED_MIXED; /* calculate mixed stats */
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ ts_lcl->clat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->slat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->lat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->bw_stat[i].min_val = ULONG_MAX;
+ ts_lcl->iops_stat[i].min_val = ULONG_MAX;
+ ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
+ ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
+ }
+ ts_lcl->sync_stat.min_val = ULONG_MAX;
+
+ sum_thread_stats(ts_lcl, ts, 1);
+
+ assert(ddir_rw(ddir));
+
+ if (!ts_lcl->runtime[ddir])
+ return;
+
+ i2p = is_power_of_2(rs->kb_base);
+ runt = ts_lcl->runtime[ddir];
+
+ bw = (1000 * ts_lcl->io_bytes[ddir]) / runt;
+ io_p = num2str(ts_lcl->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE);
+ bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base);
+ bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base);
+
+ iops = (1000 * ts_lcl->total_io_u[ddir]) / runt;
+ iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE);
+
+ log_buf(out, " mixed: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
+ iops_p, bw_p, bw_p_alt, io_p,
+ (unsigned long long) ts_lcl->runtime[ddir],
+ post_st ? : "");
+
+ free(post_st);
+ free(io_p);
+ free(bw_p);
+ free(bw_p_alt);
+ free(iops_p);
+
+ if (calc_lat(&ts_lcl->slat_stat[ddir], &min, &max, &mean, &dev))
+ display_lat("slat", min, max, mean, dev, out);
+ if (calc_lat(&ts_lcl->clat_stat[ddir], &min, &max, &mean, &dev))
+ display_lat("clat", min, max, mean, dev, out);
+ if (calc_lat(&ts_lcl->lat_stat[ddir], &min, &max, &mean, &dev))
+ display_lat(" lat", min, max, mean, dev, out);
+ if (calc_lat(&ts_lcl->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
+ display_lat(ts_lcl->lat_percentiles ? "high prio_lat" : "high prio_clat",
+ min, max, mean, dev, out);
+ if (calc_lat(&ts_lcl->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
+ display_lat(ts_lcl->lat_percentiles ? "low prio_lat" : "low prio_clat",
+ min, max, mean, dev, out);
+ }
+
+ if (ts->slat_percentiles && ts_lcl->slat_stat[ddir].samples > 0)
+ show_clat_percentiles(ts_lcl->io_u_plat[FIO_SLAT][ddir],
+ ts_lcl->slat_stat[ddir].samples,
+ ts->percentile_list,
+ ts->percentile_precision, "slat", out);
+ if (ts->clat_percentiles && ts_lcl->clat_stat[ddir].samples > 0)
+ show_clat_percentiles(ts_lcl->io_u_plat[FIO_CLAT][ddir],
+ ts_lcl->clat_stat[ddir].samples,
+ ts->percentile_list,
+ ts->percentile_precision, "clat", out);
+ if (ts->lat_percentiles && ts_lcl->lat_stat[ddir].samples > 0)
+ show_clat_percentiles(ts_lcl->io_u_plat[FIO_LAT][ddir],
+ ts_lcl->lat_stat[ddir].samples,
+ ts->percentile_list,
+ ts->percentile_precision, "lat", out);
+
+ if (ts->clat_percentiles || ts->lat_percentiles) {
+ const char *name = ts->lat_percentiles ? "lat" : "clat";
+ char prio_name[32];
+ uint64_t samples;
+
+ if (ts->lat_percentiles)
+ samples = ts_lcl->lat_stat[ddir].samples;
+ else
+ samples = ts_lcl->clat_stat[ddir].samples;
+
+ /* Only print this if some high and low priority stats were collected */
+ if (ts_lcl->clat_high_prio_stat[ddir].samples > 0 &&
+ ts_lcl->clat_low_prio_stat[ddir].samples > 0)
+ {
+ sprintf(prio_name, "high prio (%.2f%%) %s",
+ 100. * (double) ts_lcl->clat_high_prio_stat[ddir].samples / (double) samples,
+ name);
+ show_clat_percentiles(ts_lcl->io_u_plat_high_prio[ddir],
+ ts_lcl->clat_high_prio_stat[ddir].samples,
+ ts->percentile_list,
+ ts->percentile_precision, prio_name, out);
+
+ sprintf(prio_name, "low prio (%.2f%%) %s",
+ 100. * (double) ts_lcl->clat_low_prio_stat[ddir].samples / (double) samples,
+ name);
+ show_clat_percentiles(ts_lcl->io_u_plat_low_prio[ddir],
+ ts_lcl->clat_low_prio_stat[ddir].samples,
+ ts->percentile_list,
+ ts->percentile_precision, prio_name, out);
+ }
+ }
+
+ if (calc_lat(&ts_lcl->bw_stat[ddir], &min, &max, &mean, &dev)) {
+ double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
+ const char *bw_str;
+
+ if ((rs->unit_base == 1) && i2p)
+ bw_str = "Kibit";
+ else if (rs->unit_base == 1)
+ bw_str = "kbit";
+ else if (i2p)
+ bw_str = "KiB";
+ else
+ bw_str = "kB";
+
+ p_of_agg = convert_agg_kbytes_percent(rs, ddir, mean);
+
+ if (rs->unit_base == 1) {
+ min *= 8.0;
+ max *= 8.0;
+ mean *= 8.0;
+ dev *= 8.0;
+ }
+
+ if (mean > fkb_base * fkb_base) {
+ min /= fkb_base;
+ max /= fkb_base;
+ mean /= fkb_base;
+ dev /= fkb_base;
+ bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
+ }
+
+ log_buf(out, " bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, "
+ "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+ bw_str, min, max, p_of_agg, mean, dev,
+ (&ts_lcl->bw_stat[ddir])->samples);
+ }
+ if (calc_lat(&ts_lcl->iops_stat[ddir], &min, &max, &mean, &dev)) {
+ log_buf(out, " iops : min=%5llu, max=%5llu, "
+ "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+ min, max, mean, dev, (&ts_lcl->iops_stat[ddir])->samples);
+ }
+
+ free(ts_lcl);
+}
+
static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
int ddir, struct buf_output *out)
{
@@ -477,7 +683,7 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
}
log_buf(out, " %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
- rs->unified_rw_rep ? "mixed" : io_ddir_name(ddir),
+ (ts->unified_rw_rep == UNIFIED_MIXED) ? "mixed" : io_ddir_name(ddir),
iops_p, bw_p, bw_p_alt, io_p,
(unsigned long long) ts->runtime[ddir],
post_st ? : "");
@@ -1083,6 +1289,9 @@ static void show_thread_status_normal(struct thread_stat *ts,
show_ddir_status(rs, ts, ddir, out);
}
+ if (ts->unified_rw_rep == UNIFIED_BOTH)
+ show_mixed_ddir_status(rs, ts, out);
+
show_latencies(ts, out);
if (ts->sync_stat.samples)
@@ -1205,7 +1414,7 @@ static void show_ddir_status_terse(struct thread_stat *ts,
&minv);
else
len = 0;
-
+
for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
if (i >= len) {
log_buf(out, ";0%%=0");
@@ -1249,6 +1458,40 @@ static void show_ddir_status_terse(struct thread_stat *ts,
}
}
+static void show_mixed_ddir_status_terse(struct thread_stat *ts,
+ struct group_run_stats *rs,
+ int ver, struct buf_output *out)
+{
+ struct thread_stat *ts_lcl;
+ int i;
+
+ /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+ ts_lcl = malloc(sizeof(struct thread_stat));
+ memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
+ ts_lcl->unified_rw_rep = UNIFIED_MIXED; /* calculate mixed stats */
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ ts_lcl->clat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->slat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->lat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->bw_stat[i].min_val = ULONG_MAX;
+ ts_lcl->iops_stat[i].min_val = ULONG_MAX;
+ ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
+ ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
+ }
+ ts_lcl->sync_stat.min_val = ULONG_MAX;
+ ts_lcl->lat_percentiles = ts->lat_percentiles;
+ ts_lcl->clat_percentiles = ts->clat_percentiles;
+ ts_lcl->slat_percentiles = ts->slat_percentiles;
+ ts_lcl->percentile_precision = ts->percentile_precision;
+ memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
+
+ sum_thread_stats(ts_lcl, ts, 1);
+
+ /* add the aggregated stats to json parent */
+ show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
+ free(ts_lcl);
+}
+
static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles,
struct io_stat *lat_stat, uint64_t *io_u_plat)
{
@@ -1310,12 +1553,12 @@ static void add_ddir_status_json(struct thread_stat *ts,
assert(ddir_rw(ddir) || ddir_sync(ddir));
- if (ts->unified_rw_rep && ddir != DDIR_READ)
+ if ((ts->unified_rw_rep == UNIFIED_MIXED) && ddir != DDIR_READ)
return;
dir_object = json_create_object();
json_object_add_value_object(parent,
- ts->unified_rw_rep ? "mixed" : io_ddir_name(ddir), dir_object);
+ (ts->unified_rw_rep == UNIFIED_MIXED) ? "mixed" : io_ddir_name(ddir), dir_object);
if (ddir_rw(ddir)) {
bw_bytes = 0;
@@ -1418,6 +1661,39 @@ static void add_ddir_status_json(struct thread_stat *ts,
}
}
+static void add_mixed_ddir_status_json(struct thread_stat *ts,
+ struct group_run_stats *rs, struct json_object *parent)
+{
+ struct thread_stat *ts_lcl;
+ int i;
+
+ /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+ ts_lcl = malloc(sizeof(struct thread_stat));
+ memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
+ ts_lcl->unified_rw_rep = UNIFIED_MIXED; /* calculate mixed stats */
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ ts_lcl->clat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->slat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->lat_stat[i].min_val = ULONG_MAX;
+ ts_lcl->bw_stat[i].min_val = ULONG_MAX;
+ ts_lcl->iops_stat[i].min_val = ULONG_MAX;
+ ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
+ ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
+ }
+ ts_lcl->sync_stat.min_val = ULONG_MAX;
+ ts_lcl->lat_percentiles = ts->lat_percentiles;
+ ts_lcl->clat_percentiles = ts->clat_percentiles;
+ ts_lcl->slat_percentiles = ts->slat_percentiles;
+ ts_lcl->percentile_precision = ts->percentile_precision;
+ memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
+
+ sum_thread_stats(ts_lcl, ts, 1);
+
+ /* add the aggregated stats to json parent */
+ add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
+ free(ts_lcl);
+}
+
static void show_thread_status_terse_all(struct thread_stat *ts,
struct group_run_stats *rs, int ver,
struct buf_output *out)
@@ -1435,14 +1711,17 @@ static void show_thread_status_terse_all(struct thread_stat *ts,
log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
ts->name, ts->groupid, ts->error);
- /* Log Read Status */
+ /* Log Read Status, or mixed if unified_rw_rep = 1 */
show_ddir_status_terse(ts, rs, DDIR_READ, ver, out);
- /* Log Write Status */
- show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
- /* Log Trim Status */
- if (ver == 2 || ver == 4 || ver == 5)
- show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
-
+ if (ts->unified_rw_rep != UNIFIED_MIXED) {
+ /* Log Write Status */
+ show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
+ /* Log Trim Status */
+ if (ver == 2 || ver == 4 || ver == 5)
+ show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
+ }
+ if (ts->unified_rw_rep == UNIFIED_BOTH)
+ show_mixed_ddir_status_terse(ts, rs, ver, out);
/* CPU Usage */
if (ts->total_run_time) {
double runt = (double) ts->total_run_time;
@@ -1547,6 +1826,9 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
add_ddir_status_json(ts, rs, DDIR_TRIM, root);
add_ddir_status_json(ts, rs, DDIR_SYNC, root);
+ if (ts->unified_rw_rep == UNIFIED_BOTH)
+ add_mixed_ddir_status_json(ts, rs, root);
+
/* CPU Usage */
if (ts->total_run_time) {
double runt = (double) ts->total_run_time;
@@ -1875,7 +2157,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
int k, l, m;
for (l = 0; l < DDIR_RWDIR_CNT; l++) {
- if (!dst->unified_rw_rep) {
+ if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false);
sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false);
@@ -1931,7 +2213,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
dst->io_u_lat_m[k] += src->io_u_lat_m[k];
for (k = 0; k < DDIR_RWDIR_CNT; k++) {
- if (!dst->unified_rw_rep) {
+ if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
dst->total_io_u[k] += src->total_io_u[k];
dst->short_io_u[k] += src->short_io_u[k];
dst->drop_io_u[k] += src->drop_io_u[k];
@@ -1947,7 +2229,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
for (k = 0; k < FIO_LAT_CNT; k++)
for (l = 0; l < DDIR_RWDIR_CNT; l++)
for (m = 0; m < FIO_IO_U_PLAT_NR; m++)
- if (!dst->unified_rw_rep)
+ if (!(dst->unified_rw_rep == UNIFIED_MIXED))
dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m];
else
dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m];
@@ -1957,7 +2239,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
for (k = 0; k < DDIR_RWDIR_CNT; k++) {
for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
- if (!dst->unified_rw_rep) {
+ if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m];
dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m];
} else {
@@ -2166,7 +2448,7 @@ void __show_run_stats(void)
rs->kb_base = ts->kb_base;
rs->unit_base = ts->unit_base;
rs->sig_figs = ts->sig_figs;
- rs->unified_rw_rep += ts->unified_rw_rep;
+ rs->unified_rw_rep |= ts->unified_rw_rep;
for (j = 0; j < DDIR_RWDIR_CNT; j++) {
if (!ts->runtime[j])
@@ -2299,7 +2581,7 @@ void __show_run_stats(void)
free(opt_lists);
}
-void __show_running_run_stats(void)
+int __show_running_run_stats(void)
{
struct thread_data *td;
unsigned long long *rt;
@@ -2350,6 +2632,8 @@ void __show_running_run_stats(void)
free(rt);
fio_sem_up(stat_sem);
+
+ return 0;
}
static bool status_file_disabled;
@@ -2534,6 +2818,14 @@ void regrow_logs(struct thread_data *td)
td->flags &= ~TD_F_REGROW_LOGS;
}
+void regrow_agg_logs(void)
+{
+ enum fio_ddir ddir;
+
+ for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
+ regrow_log(agg_io_log[ddir]);
+}
+
static struct io_logs *get_cur_log(struct io_log *iolog)
{
struct io_logs *cur_log;
@@ -2737,7 +3029,8 @@ static unsigned long add_log_sample(struct thread_data *td,
__add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0, priority_bit);
- iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec);
+ iolog->avg_last[ddir] = elapsed - (elapsed % iolog->avg_msec);
+
return iolog->avg_msec;
}
@@ -2975,7 +3268,7 @@ static int __add_samples(struct thread_data *td, struct timespec *parent_tv,
next_log = avg_time;
spent = mtime_since(parent_tv, t);
- if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK)
+ if (spent < avg_time && avg_time - spent > LOG_MSEC_SLACK)
return avg_time - spent;
if (needs_lock)
@@ -3068,13 +3361,16 @@ static int add_iops_samples(struct thread_data *td, struct timespec *t)
int calc_log_samples(void)
{
struct thread_data *td;
- unsigned int next = ~0U, tmp;
+ unsigned int next = ~0U, tmp = 0, next_mod = 0, log_avg_msec_min = -1U;
struct timespec now;
int i;
+ long elapsed_time = 0;
fio_gettime(&now, NULL);
for_each_td(td, i) {
+ elapsed_time = mtime_since_now(&td->epoch);
+
if (!td->o.stats)
continue;
if (in_ramp_time(td) ||
@@ -3085,17 +3381,34 @@ int calc_log_samples(void)
if (!td->bw_log ||
(td->bw_log && !per_unit_log(td->bw_log))) {
tmp = add_bw_samples(td, &now);
- if (tmp < next)
- next = tmp;
+
+ if (td->bw_log)
+ log_avg_msec_min = min(log_avg_msec_min, (unsigned int)td->bw_log->avg_msec);
}
if (!td->iops_log ||
(td->iops_log && !per_unit_log(td->iops_log))) {
tmp = add_iops_samples(td, &now);
- if (tmp < next)
- next = tmp;
+
+ if (td->iops_log)
+ log_avg_msec_min = min(log_avg_msec_min, (unsigned int)td->iops_log->avg_msec);
}
+
+ if (tmp < next)
+ next = tmp;
}
+ /* if log_avg_msec_min has not been changed, set it to 0 */
+ if (log_avg_msec_min == -1U)
+ log_avg_msec_min = 0;
+
+ if (log_avg_msec_min == 0)
+ next_mod = elapsed_time;
+ else
+ next_mod = elapsed_time % log_avg_msec_min;
+
+ /* correction to keep the time on the log avg msec boundary */
+ next = min(next, (log_avg_msec_min - next_mod));
+
return next == ~0U ? 0 : next;
}
diff --git a/stat.h b/stat.h
index 0d141666..d08d4dc0 100644
--- a/stat.h
+++ b/stat.h
@@ -146,6 +146,9 @@ enum block_info_state {
#define FIO_JOBNAME_SIZE 128
#define FIO_JOBDESC_SIZE 256
#define FIO_VERROR_SIZE 128
+#define UNIFIED_SPLIT 0
+#define UNIFIED_MIXED 1
+#define UNIFIED_BOTH 2
enum fio_lat {
FIO_SLAT = 0,
@@ -319,7 +322,7 @@ extern void show_group_stats(struct group_run_stats *rs, struct buf_output *);
extern bool calc_thread_status(struct jobs_eta *je, int force);
extern void display_thread_status(struct jobs_eta *je);
extern void __show_run_stats(void);
-extern void __show_running_run_stats(void);
+extern int __show_running_run_stats(void);
extern void show_running_run_stats(void);
extern void check_for_running_stats(void);
extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
diff --git a/steadystate.c b/steadystate.c
index bd2f70dd..2e3da1db 100644
--- a/steadystate.c
+++ b/steadystate.c
@@ -196,7 +196,7 @@ static bool steadystate_deviation(uint64_t iops, uint64_t bw,
return false;
}
-void steadystate_check(void)
+int steadystate_check(void)
{
int i, j, ddir, prev_groupid, group_ramp_time_over = 0;
unsigned long rate_time;
@@ -302,6 +302,7 @@ void steadystate_check(void)
}
}
}
+ return 0;
}
int td_steadystate_init(struct thread_data *td)
diff --git a/steadystate.h b/steadystate.h
index 51472c46..bbb86fbb 100644
--- a/steadystate.h
+++ b/steadystate.h
@@ -4,7 +4,7 @@
#include "thread_options.h"
extern void steadystate_free(struct thread_data *);
-extern void steadystate_check(void);
+extern int steadystate_check(void);
extern void steadystate_setup(void);
extern int td_steadystate_init(struct thread_data *);
extern uint64_t steadystate_bw_mean(struct thread_stat *);
diff --git a/t/dedupe.c b/t/dedupe.c
index 68d31f19..8b659c76 100644
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -473,11 +473,14 @@ static void show_chunk(struct chunk *c)
}
}
-static void show_stat(uint64_t nextents, uint64_t nchunks)
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents)
{
double perc, ratio;
- printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks);
+ printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents, (unsigned long) nchunks);
+ if (!bloom)
+ printf(" Duplicated extents=%lu", (unsigned long) ndupextents);
+ printf("\n");
if (nchunks) {
ratio = (double) nextents / (double) nchunks;
@@ -485,17 +488,20 @@ static void show_stat(uint64_t nextents, uint64_t nchunks)
} else
printf("De-dupe ratio: 1:infinite\n");
+ if (ndupextents)
+ printf("De-dupe working set at least: %3.2f%%\n", 100.0 * (double) ndupextents / (double) nextents);
+
perc = 1.00 - ((double) nchunks / (double) nextents);
perc *= 100.0;
printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
}
-static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
{
struct fio_rb_node *n;
- *nchunks = *nextents = 0;
+ *nchunks = *nextents = *ndupextents = 0;
n = rb_first(&rb_root);
if (!n)
@@ -507,6 +513,7 @@ static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
c = rb_entry(n, struct chunk, rb_node);
(*nchunks)++;
*nextents += c->count;
+ *ndupextents += (c->count > 1);
if (dump_output)
show_chunk(c);
@@ -530,7 +537,7 @@ static int usage(char *argv[])
int main(int argc, char *argv[])
{
- uint64_t nextents = 0, nchunks = 0;
+ uint64_t nextents = 0, nchunks = 0, ndupextents = 0;
int c, ret;
arch_init(argv);
@@ -583,9 +590,9 @@ int main(int argc, char *argv[])
if (!ret) {
if (!bloom)
- iter_rb_tree(&nextents, &nchunks);
+ iter_rb_tree(&nextents, &nchunks, &ndupextents);
- show_stat(nextents, nchunks);
+ show_stat(nextents, nchunks, ndupextents);
}
fio_sem_remove(rb_lock);
diff --git a/t/fuzz/fuzz_parseini.c b/t/fuzz/fuzz_parseini.c
new file mode 100644
index 00000000..7e422c18
--- /dev/null
+++ b/t/fuzz/fuzz_parseini.c
@@ -0,0 +1,41 @@
+#include "fio.h"
+
+static int initialized = 0;
+
+const char *const fakeargv[] = {(char *) "fuzz",
+ (char *) "--output", (char *) "/dev/null",
+ (char *) "--parse-only",
+ 0};
+
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+ char *fuzzedini;
+
+ if (size < 2)
+ return 0;
+
+ if (initialized == 0) {
+ if (fio_init_options()) {
+ printf("Failed fio_init_options\n");
+ return 1;
+ }
+
+ parse_cmd_line(4, (char **) fakeargv, 0);
+ sinit();
+
+ initialized = 1;
+ }
+ fuzzedini = malloc(size);
+ if (!fuzzedini) {
+ printf("Failed malloc\n");
+ return 1;
+ }
+ /* final character is type for parse_jobs_ini */
+ memcpy(fuzzedini, data, size - 1);
+ /* ensures final 0 */
+ fuzzedini[size - 1] = 0;
+
+ parse_jobs_ini(fuzzedini, 1, 0, data[size - 1]);
+ free(fuzzedini);
+ return 0;
+}
diff --git a/t/fuzz/onefile.c b/t/fuzz/onefile.c
new file mode 100644
index 00000000..2ed3bbe6
--- /dev/null
+++ b/t/fuzz/onefile.c
@@ -0,0 +1,51 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+int main(int argc, char** argv)
+{
+ FILE *fp;
+ uint8_t *data;
+ size_t size;
+
+ if (argc != 2)
+ return 1;
+
+ /* opens the file, get its size, and reads it into a buffer */
+ fp = fopen(argv[1], "rb");
+ if (fp == NULL)
+ return 2;
+
+ if (fseek(fp, 0L, SEEK_END) != 0) {
+ fclose(fp);
+ return 2;
+ }
+ size = ftell(fp);
+ if (size == (size_t) -1) {
+ fclose(fp);
+ return 2;
+ }
+ if (fseek(fp, 0L, SEEK_SET) != 0) {
+ fclose(fp);
+ return 2;
+ }
+ data = malloc(size);
+ if (data == NULL) {
+ fclose(fp);
+ return 2;
+ }
+ if (fread(data, size, 1, fp) != 1) {
+ fclose(fp);
+ free(data);
+ return 2;
+ }
+
+ /* launch fuzzer */
+ LLVMFuzzerTestOneInput(data, size);
+ free(data);
+ fclose(fp);
+
+ return 0;
+}
diff --git a/t/genzipf.c b/t/genzipf.c
index 4fc10ae7..cd62e584 100644
--- a/t/genzipf.c
+++ b/t/genzipf.c
@@ -297,11 +297,11 @@ int main(int argc, char *argv[])
nranges /= block_size;
if (dist_type == TYPE_ZIPF)
- zipf_init(&zs, nranges, dist_val, 1);
+ zipf_init(&zs, nranges, dist_val, -1, 1);
else if (dist_type == TYPE_PARETO)
- pareto_init(&zs, nranges, dist_val, 1);
+ pareto_init(&zs, nranges, dist_val, -1, 1);
else
- gauss_init(&gs, nranges, dist_val, 1);
+ gauss_init(&gs, nranges, dist_val, -1, 1);
hash_bits = 0;
hash_size = nranges;
diff --git a/t/io_uring.c b/t/io_uring.c
index 044f9195..ff4c7a7c 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -233,8 +233,7 @@ static int prep_more_ios(struct submitter *s, int max_ios)
next_tail = tail = *ring->tail;
do {
next_tail++;
- read_barrier();
- if (next_tail == *ring->head)
+ if (next_tail == atomic_load_acquire(ring->head))
break;
index = tail & sq_ring_mask;
@@ -244,10 +243,8 @@ static int prep_more_ios(struct submitter *s, int max_ios)
tail = next_tail;
} while (prepped < max_ios);
- if (*ring->tail != tail) {
- *ring->tail = tail;
- write_barrier();
- }
+ if (prepped)
+ atomic_store_release(ring->tail, tail);
return prepped;
}
@@ -284,7 +281,7 @@ static int reap_events(struct submitter *s)
struct file *f;
read_barrier();
- if (head == *ring->tail)
+ if (head == atomic_load_acquire(ring->tail))
break;
cqe = &ring->cqes[head & cq_ring_mask];
if (!do_nop) {
@@ -301,9 +298,10 @@ static int reap_events(struct submitter *s)
head++;
} while (1);
- s->inflight -= reaped;
- *ring->head = head;
- write_barrier();
+ if (reaped) {
+ s->inflight -= reaped;
+ atomic_store_release(ring->head, head);
+ }
return reaped;
}
@@ -320,6 +318,7 @@ static void *submitter_fn(void *data)
prepped = 0;
do {
int to_wait, to_submit, this_reap, to_prep;
+ unsigned ring_flags = 0;
if (!prepped && s->inflight < depth) {
to_prep = min(depth - s->inflight, batch_submit);
@@ -338,15 +337,20 @@ submit:
* Only need to call io_uring_enter if we're not using SQ thread
* poll, or if IORING_SQ_NEED_WAKEUP is set.
*/
- if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
+ if (sq_thread_poll)
+ ring_flags = atomic_load_acquire(ring->flags);
+ if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) {
unsigned flags = 0;
if (to_wait)
flags = IORING_ENTER_GETEVENTS;
- if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
+ if (ring_flags & IORING_SQ_NEED_WAKEUP)
flags |= IORING_ENTER_SQ_WAKEUP;
ret = io_uring_enter(s, to_submit, to_wait, flags);
s->calls++;
+ } else {
+ /* for SQPOLL, we submitted it all effectively */
+ ret = to_submit;
}
/*
diff --git a/t/latency_percentiles.py b/t/latency_percentiles.py
index 6ce4579a..cc437426 100755
--- a/t/latency_percentiles.py
+++ b/t/latency_percentiles.py
@@ -216,7 +216,7 @@ class FioLatTest():
file_data = file.read()
#
- # Read the first few lines and see if any of them begin with '3;fio-'
+ # Read the first few lines and see if any of them begin with '3;'
# If so, the line is probably terse output. Obviously, this only
# works for fio terse version 3 and it does not work for
# multi-line terse output
@@ -224,7 +224,7 @@ class FioLatTest():
lines = file_data.splitlines()
for i in range(8):
file_data = lines[i]
- if file_data.startswith('3;fio-'):
+ if file_data.startswith('3;'):
self.terse_data = file_data.split(';')
return True
diff --git a/t/memlock.c b/t/memlock.c
index 418dc3c4..9f5a3ea8 100644
--- a/t/memlock.c
+++ b/t/memlock.c
@@ -22,7 +22,7 @@ static void *worker(void *data)
for (index = 0; index + 4096 < size; index += 4096)
memset(&buf[index+512], 0x89, 512);
if (first) {
- printf("loop%d: did %lu MiB\n", i+1, size/(1024UL*1024UL));
+ printf("loop%d: did %lu MiB\n", i+1, td->mib);
first = 0;
}
}
diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py
index e5c2f17c..a59cdfe0 100755
--- a/t/run-fio-tests.py
+++ b/t/run-fio-tests.py
@@ -879,8 +879,8 @@ TEST_LIST = [
{
'test_id': 1007,
'test_class': FioExeTest,
- 'exe': 't/zbd/run-tests-against-regular-nullb',
- 'parameters': None,
+ 'exe': 't/zbd/run-tests-against-nullb',
+ 'parameters': ['-s', '1'],
'success': SUCCESS_DEFAULT,
'requirements': [Requirements.linux, Requirements.zbd,
Requirements.root],
@@ -888,8 +888,8 @@ TEST_LIST = [
{
'test_id': 1008,
'test_class': FioExeTest,
- 'exe': 't/zbd/run-tests-against-zoned-nullb',
- 'parameters': None,
+ 'exe': 't/zbd/run-tests-against-nullb',
+ 'parameters': ['-s', '2'],
'success': SUCCESS_DEFAULT,
'requirements': [Requirements.linux, Requirements.zbd,
Requirements.root, Requirements.zoned_nullb],
diff --git a/t/zbd/functions b/t/zbd/functions
index 1a64a215..08a2c629 100644
--- a/t/zbd/functions
+++ b/t/zbd/functions
@@ -71,7 +71,7 @@ first_sequential_zone() {
if [ -n "${blkzone}" ] && [ ! -n "${use_libzbc}" ]; then
${blkzone} report "$dev" |
- sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*type:[[:blank:]]2(.*/\1 \2/p' |
+ sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*zcond:\(14\|[[:blank:]][0-4]\)(.*type:[[:blank:]]\([2]\)(.*/\1 \2/p' |
{
read -r starting_sector length &&
# Convert from hex to decimal
@@ -79,7 +79,7 @@ first_sequential_zone() {
}
else
${zbc_report_zones} "$dev" |
- sed -n 's/^Zone [0-9]*: type 0x2 .*, sector \([0-9]*\), \([0-9]*\) sectors,.*$/\1 \2/p' |
+ sed -n 's/^Zone [0-9]*: type 0x2 .*,[[:blank:]]cond[[:blank:]]0x[0-4e][[:blank:]].*, sector \([0-9]*\), \([0-9]*\) sectors.*$/\1 \2/p' |
head -n1
fi
}
@@ -121,15 +121,75 @@ total_zone_capacity() {
echo $((capacity * 512))
}
+# Reports the starting sector and length of the first zone of device $1
+# that is not in offline (or similar) condition.
+first_online_zone() {
+ local dev=$1
+
+ if [ -z "$is_zbd" ]; then
+ echo 0
+ return
+ fi
+
+ if [ -n "${blkzone}" ] && [ ! -n "${use_libzbc}" ]; then
+ ${blkzone} report "$dev" |
+ sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*zcond:\(14\|[[:blank:]][0-4]\)(.*type:[[:blank:]][12](.*/\1/p' |
+ head -n1 |
+ {
+ read -r starting_sector &&
+ # Convert from hex to decimal
+ echo $((starting_sector))
+ }
+ else
+ ${zbc_report_zones} "$dev" |
+ sed -n 's/^Zone[[:blank:]][0-9]*:[[:blank:]]type[[:blank:]]0x[12][[:blank:]].*,[[:blank:]]cond[[:blank:]]0x[0-4e][[:blank:]].*,[[:blank:]]sector[[:blank:]]\([0-9]*\),.*$/\1/p' |
+ head -n1
+ fi
+}
+
+# Reports the starting sector and length of the last zone of device $1
+# that is not in offline (or similar) condition.
+last_online_zone() {
+ local dev=$1
+
+ if [ -z "$is_zbd" ]; then
+ echo 0
+ return
+ fi
+
+ if [ -n "${blkzone}" ] && [ ! -n "${use_libzbc}" ]; then
+ ${blkzone} report "$dev" |
+ sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*zcond:\(14\|[[:blank:]][0-4]\)(.*type:[[:blank:]][12](.*/\1/p' |
+ tail -1 |
+ {
+ read -r starting_sector &&
+ # Convert from hex to decimal
+ echo $((starting_sector))
+ }
+ else
+ ${zbc_report_zones} "$dev" |
+ sed -n 's/^Zone[[:blank:]][0-9]*:[[:blank:]]type[[:blank:]]0x[12][[:blank:]].*,[[:blank:]]cond[[:blank:]]0x[0-4e][[:blank:]].*,[[:blank:]]sector[[:blank:]]\([0-9]*\),.*$/\1/p' |
+ tail -1
+ fi
+}
+
+# Get max_open_zones of SMR drives using sg_inq or libzbc tools. Two test cases
+# 31 and 32 use this max_open_zones value. The test case 31 uses max_open_zones
+# to decide number of write target zones. The test case 32 passes max_open_zones
+# value to fio with --max_open_zones option. Of note is that fio itself has the
+# feature to get max_open_zones from the device through sysfs or ioengine
+# specific implementation. This max_open_zones fetch by test script is required
+# in case fio is running on an old Linux kernel version which lacks
+# max_open_zones in sysfs, or which lacks zoned block device support completely.
max_open_zones() {
local dev=$1
if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then
if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" \
> /dev/null 2>&1; then
- # Non scsi device such as null_blk can not return max open zones.
- # Use default value.
- echo 128
+ # When sg_inq can not get max open zones, specify 0 which indicates
+ # fio to get max open zones limit from the device.
+ echo 0
else
${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" | tail -1 |
{
diff --git a/t/zbd/run-tests-against-nullb b/t/zbd/run-tests-against-nullb
new file mode 100755
index 00000000..db901179
--- /dev/null
+++ b/t/zbd/run-tests-against-nullb
@@ -0,0 +1,354 @@
+#!/bin/bash
+#
+# Copyright (C) 2020 Western Digital Corporation or its affiliates.
+#
+# This file is released under the GPL.
+#
+# Run t/zbd/test-zbd-support script against a variety of conventional,
+# zoned and mixed zone configurations.
+#
+
+usage()
+{
+ echo "This script runs the tests from t/zbd/test-zbd-support script"
+ echo "against a nullb device in a variety of conventional and zoned"
+ echo "configurations."
+ echo "Usage: ${0} [OPTIONS]"
+ echo "Options:"
+ echo -e "\t-h Show this message."
+ echo -e "\t-L List the device layouts for every section without running"
+ echo -e "\t tests."
+ echo -e "\t-s <#section> Only run the section with the given number."
+ echo -e "\t-l Use libzbc ioengine to run the tests."
+ echo -e "\t-t <#test> Only run the test with the given number in every section."
+ echo -e "\t-o <max_open_zones> Specify MaxOpen value, (${set_max_open} by default)."
+ echo -e "\t-n <#number of runs> Set the number of times to run the entire suite "
+ echo -e "\t or an individual section/test."
+ echo -e "\t-q Quit t/zbd/test-zbd-support run after any failed test."
+ echo -e "\t-r Remove the /dev/nullb0 device that may still exist after"
+ echo -e "\t running this script."
+ exit 1
+}
+
+cleanup_nullb()
+{
+ for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done
+ modprobe -r null_blk
+ modprobe null_blk nr_devices=0 || exit $?
+ for d in /sys/kernel/config/nullb/*; do
+ [ -d "$d" ] && rmdir "$d"
+ done
+ modprobe -r null_blk
+ [ -e /sys/module/null_blk ] && exit $?
+}
+
+create_nullb()
+{
+ modprobe null_blk nr_devices=0 &&
+ cd /sys/kernel/config/nullb &&
+ mkdir nullb0 &&
+ cd nullb0 || return $?
+}
+
+configure_nullb()
+{
+ echo 0 > completion_nsec &&
+ echo ${dev_blocksize} > blocksize &&
+ echo ${dev_size} > size &&
+ echo 1 > memory_backed || return $?
+
+ if ((conv_pcnt < 100)); then
+ echo 1 > zoned &&
+ echo "${zone_size}" > zone_size || return $?
+
+ if ((zone_capacity < zone_size)); then
+ if ((!zcap_supported)); then
+ echo "null_blk does not support zone capacity"
+ return 2
+ fi
+ echo "${zone_capacity}" > zone_capacity
+ fi
+ if ((conv_pcnt)); then
+ if ((!conv_supported)); then
+ echo "null_blk does not support conventional zones"
+ return 2
+ fi
+ nr_conv=$((dev_size/zone_size*conv_pcnt/100))
+ echo "${nr_conv}" > zone_nr_conv
+ fi
+ fi
+
+ echo 1 > power || return $?
+ return 0
+}
+
+show_nullb_config()
+{
+ if ((conv_pcnt < 100)); then
+ echo " $(printf "Zoned Device, %d%% Conventional Zones (%d)" \
+ ${conv_pcnt} ${nr_conv})"
+ echo " $(printf "Zone Size: %d MB" ${zone_size})"
+ echo " $(printf "Zone Capacity: %d MB" ${zone_capacity})"
+ if ((max_open)); then
+ echo " $(printf "Max Open: %d Zones" ${max_open})"
+ else
+ echo " Max Open: Unlimited Zones"
+ fi
+ else
+ echo " Non-zoned Device"
+ fi
+}
+
+#
+# Test sections.
+#
+# Fully conventional device.
+section1()
+{
+ conv_pcnt=100
+ max_open=0
+}
+
+# Zoned device with no conventional zones, ZCAP == ZSIZE, unlimited MaxOpen.
+section2()
+{
+ conv_pcnt=0
+ zone_size=1
+ zone_capacity=1
+ max_open=0
+}
+
+# Zoned device with no conventional zones, ZCAP < ZSIZE, unlimited MaxOpen.
+section3()
+{
+ conv_pcnt=0
+ zone_size=4
+ zone_capacity=3
+ max_open=0
+}
+
+# Zoned device with mostly sequential zones, ZCAP == ZSIZE, unlimited MaxOpen.
+section4()
+{
+ conv_pcnt=10
+ zone_size=1
+ zone_capacity=1
+ max_open=0
+}
+
+# Zoned device with mostly sequential zones, ZCAP < ZSIZE, unlimited MaxOpen.
+section5()
+{
+ conv_pcnt=10
+ zone_size=4
+ zone_capacity=3
+ max_open=0
+}
+
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, unlimited MaxOpen.
+section6()
+{
+ conv_pcnt=66
+ zone_size=1
+ zone_capacity=1
+ max_open=0
+}
+
+# Zoned device with mostly conventional zones, ZCAP < ZSIZE, unlimited MaxOpen.
+section7()
+{
+ dev_size=2048
+ conv_pcnt=66
+ zone_size=4
+ zone_capacity=3
+ max_open=0
+}
+
+# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen.
+section8()
+{
+ dev_size=1024
+ conv_pcnt=0
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+}
+
+# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen.
+section9()
+{
+ conv_pcnt=0
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+}
+
+# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen.
+section10()
+{
+ conv_pcnt=10
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+}
+
+# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen.
+section11()
+{
+ conv_pcnt=10
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+}
+
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen.
+section12()
+{
+ conv_pcnt=66
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+}
+
+# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen.
+section13()
+{
+ dev_size=2048
+ conv_pcnt=66
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+}
+
+#
+# Entry point.
+#
+SECONDS=0
+scriptdir="$(cd "$(dirname "$0")" && pwd)"
+sections=()
+zcap_supported=1
+conv_supported=1
+list_only=0
+dev_size=1024
+dev_blocksize=4096
+set_max_open=8
+zbd_test_opts=()
+libzbc=0
+num_of_runs=1
+test_case=0
+quit_on_err=0
+
+while (($#)); do
+ case "$1" in
+ -s) sections+=("$2"); shift; shift;;
+ -o) set_max_open="${2}"; shift; shift;;
+ -L) list_only=1; shift;;
+ -r) cleanup_nullb; exit 0;;
+ -l) libzbc=1; shift;;
+ -n) num_of_runs="${2}"; shift; shift;;
+ -t) test_case="${2}"; shift; shift;;
+ -q) quit_on_err=1; shift;;
+ -h) usage; break;;
+ --) shift; break;;
+ *) usage; exit 1;;
+ esac
+done
+
+if [ "${#sections[@]}" = 0 ]; then
+ readarray -t sections < <(declare -F | grep "section[0-9]*" | tr -c -d "[:digit:]\n" | sort -n)
+fi
+
+cleanup_nullb
+
+#
+# Test creating null_blk device and check if newer features are supported
+#
+if ! eval "create_nullb"; then
+ echo "can't create nullb"
+ exit 1
+fi
+if ! cat /sys/kernel/config/nullb/features | grep -q zone_capacity; then
+ zcap_supported=0
+fi
+if ! cat /sys/kernel/config/nullb/features | grep -q zone_nr_conv; then
+ conv_supported=0
+fi
+
+rc=0
+test_rc=0
+intr=0
+run_nr=1
+trap 'kill ${zbd_test_pid}; intr=1' SIGINT
+
+while ((run_nr <= $num_of_runs)); do
+ echo -e "\nRun #$run_nr:"
+ for section_number in "${sections[@]}"; do
+ cleanup_nullb
+ echo "---------- Section $(printf "%02d" $section_number) ----------"
+ if ! eval "create_nullb"; then
+ echo "error creating nullb"
+ exit 1
+ fi
+ zbd_test_opts=()
+ if ((test_case)); then
+ zbd_test_opts+=("-t" "${test_case}")
+ fi
+ if ((quit_on_err)); then
+ zbd_test_opts+=("-q")
+ fi
+ section$section_number
+ configure_nullb
+ rc=$?
+ ((rc == 2)) && continue
+ if ((rc)); then
+ echo "can't set up nullb for section $(printf "%02d" $section_number)"
+ exit 1
+ fi
+ show_nullb_config
+ if ((libzbc)); then
+ if ((zone_capacity < zone_size)); then
+ echo "libzbc doesn't support zone capacity, skipping section $(printf "%02d" $section_number)"
+ continue
+ fi
+ if ((conv_pcnt == 100)); then
+ echo "libzbc only supports zoned devices, skipping section $(printf "%02d" $section_number)"
+ continue
+ fi
+ zbd_test_opts+=("-l")
+ fi
+ cd "${scriptdir}"
+ ((intr)) && exit 1
+ ((list_only)) && continue
+
+ ./test-zbd-support ${zbd_test_opts[@]} /dev/nullb0 &
+ zbd_test_pid=$!
+ if kill -0 "${zbd_test_pid}"; then
+ wait "${zbd_test_pid}"
+ test_rc=$?
+ else
+ echo "can't run ZBD tests"
+ exit 1
+ fi
+ ((intr)) && exit 1
+ if (($test_rc)); then
+ rc=1
+ ((quit_on_err)) && break
+ fi
+ done
+
+ ((rc && quit_on_err)) && break
+ run_nr=$((run_nr + 1))
+done
+
+if ((!list_only)); then
+ echo "--------------------------------"
+ echo "Total run time: $(TZ=UTC0 printf "%(%H:%M:%S)T\n" $(( SECONDS )) )"
+fi
+
+exit $rc
diff --git a/t/zbd/run-tests-against-regular-nullb b/t/zbd/run-tests-against-regular-nullb
deleted file mode 100755
index 5b7b4009..00000000
--- a/t/zbd/run-tests-against-regular-nullb
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-#
-# Copyright (C) 2018 Western Digital Corporation or its affiliates.
-#
-# This file is released under the GPL.
-
-scriptdir="$(cd "$(dirname "$0")" && pwd)"
-
-for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done
-modprobe -r null_blk
-modprobe null_blk nr_devices=0 || exit $?
-for d in /sys/kernel/config/nullb/*; do
- [ -d "$d" ] && rmdir "$d"
-done
-modprobe -r null_blk
-[ -e /sys/module/null_blk ] && exit $?
-modprobe null_blk nr_devices=0 &&
- cd /sys/kernel/config/nullb &&
- mkdir nullb0 &&
- cd nullb0 &&
- echo 0 > completion_nsec &&
- echo 4096 > blocksize &&
- echo 1024 > size &&
- echo 1 > memory_backed &&
- echo 1 > power || exit $?
-
-"${scriptdir}"/test-zbd-support "$@" /dev/nullb0
diff --git a/t/zbd/run-tests-against-zoned-nullb b/t/zbd/run-tests-against-zoned-nullb
deleted file mode 100755
index f9c9530c..00000000
--- a/t/zbd/run-tests-against-zoned-nullb
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-#
-# Copyright (C) 2018 Western Digital Corporation or its affiliates.
-#
-# This file is released under the GPL.
-
-scriptdir="$(cd "$(dirname "$0")" && pwd)"
-
-zone_size=1
-zone_capacity=1
-if [[ ${1} == "-h" ]]; then
- echo "Usage: ${0} [OPTIONS]"
- echo "Options:"
- echo -e "\t-h Show this message."
- echo -e "\t-zone-cap Use null blk with zone capacity less than zone size."
- echo -e "\tany option supported by test-zbd-support script."
- exit 1
-elif [[ ${1} == "-zone-cap" ]]; then
- zone_size=4
- zone_capacity=3
- shift
-fi
-
-for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done
-modprobe -r null_blk
-modprobe null_blk nr_devices=0 || exit $?
-for d in /sys/kernel/config/nullb/*; do
- [ -d "$d" ] && rmdir "$d"
-done
-modprobe -r null_blk
-[ -e /sys/module/null_blk ] && exit $?
-modprobe null_blk nr_devices=0 &&
- cd /sys/kernel/config/nullb &&
- mkdir nullb0 &&
- cd nullb0 || exit $?
-
-if ((zone_capacity < zone_size)); then
- if [[ ! -w zone_capacity ]]; then
- echo "null blk does not support zone capacity"
- exit 1
- fi
- echo "${zone_capacity}" > zone_capacity
-fi
-
-echo 1 > zoned &&
- echo "${zone_size}" > zone_size &&
- echo 0 > completion_nsec &&
- echo 4096 > blocksize &&
- echo 1024 > size &&
- echo 1 > memory_backed &&
- echo 1 > power || exit $?
-
-"${scriptdir}"/test-zbd-support "$@" /dev/nullb0
diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support
index 248423bb..57e6d05e 100755
--- a/t/zbd/test-zbd-support
+++ b/t/zbd/test-zbd-support
@@ -14,6 +14,7 @@ usage() {
echo -e "\t-r Reset all zones before test start"
echo -e "\t-o <max_open_zones> Run fio with max_open_zones limit"
echo -e "\t-t <test #> Run only a single test case with specified number"
+ echo -e "\t-q Quit the test run after any failed test"
echo -e "\t-z Run fio with debug=zbd option"
}
@@ -190,6 +191,64 @@ prep_write() {
reset_zone "${dev}" -1
}
+SKIP_TESTCASE=255
+
+require_scsi_dev() {
+ if ! is_scsi_device "$dev"; then
+ SKIP_REASON="$dev is not a SCSI device"
+ return 1
+ fi
+ return 0
+}
+
+require_conv_zone_bytes() {
+ local req_bytes=${1}
+
+ if ((req_bytes > first_sequential_zone_sector * 512)); then
+ SKIP_REASON="$dev does not have enough conventional zones"
+ return 1
+ fi
+ return 0
+}
+
+require_zbd() {
+ if [[ -z ${is_zbd} ]]; then
+ SKIP_REASON="$dev is not a zoned block device"
+ return 1
+ fi
+ return 0
+}
+
+require_regular_block_dev() {
+ if [[ -n ${is_zbd} ]]; then
+ SKIP_REASON="$dev is not a regular block device"
+ return 1
+ fi
+ return 0
+}
+
+require_seq_zones() {
+ local req_seq_zones=${1}
+ local seq_bytes=$((disk_size - first_sequential_zone_sector * 512))
+
+ if ((req_seq_zones > seq_bytes / zone_size)); then
+ SKIP_REASON="$dev does not have $req_seq_zones sequential zones"
+ return 1
+ fi
+ return 0
+}
+
+require_conv_zones() {
+ local req_c_zones=${1}
+ local conv_bytes=$((first_sequential_zone_sector * 512))
+
+ if ((req_c_zones > conv_bytes / zone_size)); then
+ SKIP_REASON="$dev does not have $req_c_zones conventional zones"
+ return 1
+ fi
+ return 0
+}
+
# Check whether buffered writes are refused.
test1() {
run_fio --name=job1 --filename="$dev" --rw=write --direct=0 --bs=4K \
@@ -221,14 +280,15 @@ test2() {
if [ -z "$is_zbd" ]; then
opts+=("--zonesize=${zone_size}")
fi
- run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $?
- ! grep -q 'WRITE:' "${logfile}.${test_number}"
+ run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 && return 1
+ grep -q 'buflen exceeds zone size' "${logfile}.${test_number}"
}
# Run fio against an empty zone. This causes fio to report "No I/O performed".
test3() {
local off opts=() rc
+ require_seq_zones 129 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 128 * zone_size))
size=$((zone_size))
[ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
@@ -246,6 +306,7 @@ test3() {
test4() {
local off opts=()
+ require_seq_zones 130 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 129 * zone_size))
size=$((zone_size))
[ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
@@ -327,10 +388,7 @@ test8() {
test9() {
local size
- if ! is_scsi_device "$dev"; then
- echo "$dev is not a SCSI device" >>"${logfile}.${test_number}"
- return 0
- fi
+ require_scsi_dev || return $SKIP_TESTCASE
prep_write
size=$((4 * zone_size))
@@ -346,10 +404,7 @@ test9() {
test10() {
local size
- if ! is_scsi_device "$dev"; then
- echo "$dev is not a SCSI device" >>"${logfile}.${test_number}"
- return 0
- fi
+ require_scsi_dev || return $SKIP_TESTCASE
prep_write
size=$((4 * zone_size))
@@ -409,18 +464,20 @@ test13() {
# Random write to conventional zones.
test14() {
- local size
+ local off size
+ if ! result=($(first_online_zone "$dev")); then
+ echo "Failed to determine first online zone"
+ exit 1
+ fi
+ off=${result[0]}
prep_write
size=$((16 * 2**20)) # 20 MB
- if [ $size -gt $((first_sequential_zone_sector * 512)) ]; then
- echo "$dev does not have enough sequential zones" \
- >>"${logfile}.${test_number}"
- return 0
- fi
+ require_conv_zone_bytes "${size}" || return $SKIP_TESTCASE
+
run_one_fio_job "$(ioengine "libaio")" --iodepth=64 --rw=randwrite --bs=16K \
--zonemode=zbd --zonesize="${zone_size}" --do_verify=1 \
- --verify=md5 --size=$size \
+ --verify=md5 --offset=$off --size=$size\
>>"${logfile}.${test_number}" 2>&1 || return $?
check_written $((size)) || return $?
check_read $((size)) || return $?
@@ -477,17 +534,26 @@ test16() {
# Random reads and writes in the last zone.
test17() {
- local io off read size written
+ local io off last read size written
off=$(((disk_size / zone_size - 1) * zone_size))
size=$((disk_size - off))
+ if ! last=($(last_online_zone "$dev")); then
+ echo "Failed to determine last online zone"
+ exit 1
+ fi
+ if [[ "$((last * 512))" -lt "$off" ]]; then
+ off=$((last * 512))
+ size=$zone_size
+ fi
if [ -n "$is_zbd" ]; then
reset_zone "$dev" $((off / 512)) || return $?
fi
prep_write
run_one_fio_job "$(ioengine "libaio")" --iodepth=8 --rw=randrw --bs=4K \
--zonemode=zbd --zonesize="${zone_size}" \
- --offset=$off --loops=2 --norandommap=1\
+ --offset=$off --loops=2 --norandommap=1 \
+ --size="$size"\
>>"${logfile}.${test_number}" 2>&1 || return $?
written=$(fio_written <"${logfile}.${test_number}")
read=$(fio_read <"${logfile}.${test_number}")
@@ -604,6 +670,7 @@ test27() {
test28() {
local i jobs=16 off opts
+ require_seq_zones 65 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 64 * zone_size))
[ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
prep_write
@@ -628,6 +695,7 @@ test28() {
test29() {
local i jobs=16 off opts=()
+ require_seq_zones 80 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 64 * zone_size))
size=$((16*zone_size))
prep_write
@@ -663,26 +731,28 @@ test30() {
test31() {
local bs inc nz off opts size
- prep_write
- # Start with writing 128 KB to 128 sequential zones.
- bs=128K
- nz=128
- # shellcheck disable=SC2017
- inc=$(((disk_size - (first_sequential_zone_sector * 512)) / (nz * zone_size)
- * zone_size))
- opts=()
- for ((off = first_sequential_zone_sector * 512; off < disk_size;
- off += inc)); do
- opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--io_size=$bs")
- opts+=("--bs=$bs" "--size=$zone_size" "$(ioengine "libaio")")
- opts+=("--rw=write" "--direct=1" "--thread=1" "--stats=0")
- opts+=("--zonemode=zbd" "--zonesize=${zone_size}")
- opts+=(${job_var_opts[@]})
- done
- "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1
- # Next, run the test.
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
+ # As preparation, write 128 KB to sequential write required zones. Limit
+ # write target zones up to max_open_zones to keep test time reasonable.
+ # To distribute the write target zones evenly, skip certain zones for every
+ # write. Utilize zonemode strided for such write patterns.
+ bs=$((128 * 1024))
+ nz=$((max_open_zones))
+ if [[ $nz -eq 0 ]]; then
+ nz=128
+ fi
off=$((first_sequential_zone_sector * 512))
size=$((disk_size - off))
+ inc=$(((size / nz / zone_size) * zone_size))
+ opts=("--name=$dev" "--filename=$dev" "--rw=write" "--bs=${bs}")
+ opts+=("--offset=$off" "--size=$((inc * nz))" "--io_size=$((bs * nz))")
+ opts+=("--zonemode=strided" "--zonesize=${bs}" "--zonerange=${inc}")
+ opts+=("--direct=1")
+ echo "fio ${opts[@]}" >> "${logfile}.${test_number}"
+ "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1
+
+ # Next, run the test.
opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size")
opts+=("--bs=$bs" "$(ioengine "psync")" "--rw=randread" "--direct=1")
opts+=("--thread=1" "--time_based" "--runtime=30" "--zonemode=zbd")
@@ -696,6 +766,8 @@ test31() {
test32() {
local off opts=() size
+ require_zbd || return $SKIP_TESTCASE
+
prep_write
off=$((first_sequential_zone_sector * 512))
size=$((disk_size - off))
@@ -773,7 +845,7 @@ test37() {
local bs off size capacity
prep_write
- capacity=$(total_zone_capacity 1 $first_sequential_zone_sector $dev)
+ capacity=$(total_zone_capacity 1 $((first_sequential_zone_sector*512)) $dev)
if [ "$first_sequential_zone_sector" = 0 ]; then
off=0
else
@@ -805,16 +877,23 @@ test38() {
# Read one block from a block device.
read_one_block() {
+ local off
local bs
+ if ! result=($(first_online_zone "$dev")); then
+ echo "Failed to determine first online zone"
+ exit 1
+ fi
+ off=${result[0]}
bs=$((logical_block_size))
- run_one_fio_job --rw=read "$(ioengine "psync")" --bs=$bs --size=$bs "$@" 2>&1 |
+ run_one_fio_job --rw=read "$(ioengine "psync")" --offset=$off --bs=$bs \
+ --size=$bs "$@" 2>&1 |
tee -a "${logfile}.${test_number}"
}
# Check whether fio accepts --zonemode=none for zoned block devices.
test39() {
- [ -n "$is_zbd" ] || return 0
+ require_zbd || return $SKIP_TESTCASE
read_one_block --zonemode=none >/dev/null || return $?
check_read $((logical_block_size)) || return $?
}
@@ -824,7 +903,7 @@ test40() {
local bs
bs=$((logical_block_size))
- [ -n "$is_zbd" ] || return 0
+ require_zbd || return $SKIP_TESTCASE
read_one_block --zonemode=strided |
grep -q 'fio: --zonesize must be specified when using --zonemode=strided' ||
return $?
@@ -834,21 +913,21 @@ test40() {
# Check whether fio checks the zone size for zoned block devices.
test41() {
- [ -n "$is_zbd" ] || return 0
+ require_zbd || return $SKIP_TESTCASE
read_one_block --zonemode=zbd --zonesize=$((2 * zone_size)) |
grep -q 'job parameter zonesize.*does not match disk zone size'
}
# Check whether fio handles --zonesize=0 correctly for regular block devices.
test42() {
- [ -n "$is_zbd" ] && return 0
+ require_regular_block_dev || return $SKIP_TESTCASE
read_one_block --zonemode=zbd --zonesize=0 |
- grep -q 'Specifying the zone size is mandatory for regular block devices with --zonemode=zbd'
+ grep -q 'Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd'
}
# Check whether fio handles --zonesize=1 correctly for regular block devices.
test43() {
- [ -n "$is_zbd" ] && return 0
+ require_regular_block_dev || return $SKIP_TESTCASE
read_one_block --zonemode=zbd --zonesize=1 |
grep -q 'zone size must be at least 512 bytes for --zonemode=zbd'
}
@@ -862,7 +941,7 @@ test44() {
test45() {
local bs i
- [ -z "$is_zbd" ] && return 0
+ require_zbd || return $SKIP_TESTCASE
prep_write
bs=$((logical_block_size))
run_one_fio_job "$(ioengine "psync")" --iodepth=1 --rw=randwrite --bs=$bs\
@@ -901,6 +980,9 @@ test47() {
test48() {
local i jobs=16 off opts=()
+ require_zbd || return $SKIP_TESTCASE
+ require_seq_zones 80 || return $SKIP_TESTCASE
+
off=$((first_sequential_zone_sector * 512 + 64 * zone_size))
size=$((16*zone_size))
prep_write
@@ -913,7 +995,7 @@ test48() {
for ((i=0;i<jobs;i++)); do
opts+=("--name=job$i" "--filename=$dev" "--offset=$off" "--bs=16K")
opts+=("--io_size=$zone_size" "--iodepth=256" "--thread=1")
- opts+=("--group_reporting=1")
+ opts+=("--size=$size" "--group_reporting=1")
# max_open_zones is already specified
opts+=($(job_var_opts_exclude "--max_open_zones"))
done
@@ -922,7 +1004,7 @@ test48() {
{ echo; echo "fio ${opts[*]}"; echo; } >>"${logfile}.${test_number}"
- timeout -v -s KILL 45s \
+ timeout -v -s KILL 180s \
"${dynamic_analyzer[@]}" "$fio" "${opts[@]}" \
>> "${logfile}.${test_number}" 2>&1 || return $?
}
@@ -930,11 +1012,7 @@ test48() {
# Check if fio handles --zonecapacity on a normal block device correctly
test49() {
- if [ -n "$is_zbd" ]; then
- echo "$dev is not a regular block device" \
- >>"${logfile}.${test_number}"
- return 0
- fi
+ require_regular_block_dev || return $SKIP_TESTCASE
size=$((2 * zone_size))
capacity=$((zone_size * 3 / 4))
@@ -948,12 +1026,203 @@ test49() {
check_read $((capacity * 2)) || return $?
}
+# Verify that conv zones are not locked and only seq zones are locked during
+# random read on conv-seq mixed zones.
+test50() {
+ local off
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 8 || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ reset_zone "${dev}" -1
+
+ off=$((first_sequential_zone_sector * 512 - 8 * zone_size))
+ run_fio --name=job --filename=${dev} --offset=${off} --bs=64K \
+ --size=$((16 * zone_size)) "$(ioengine "libaio")" --rw=randread\
+ --time_based --runtime=3 --zonemode=zbd --zonesize=${zone_size}\
+ --direct=1 --group_reporting=1 ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Verify that conv zones are neither locked nor opened during random write on
+# conv-seq mixed zones. Zone lock and zone open shall happen only on seq zones.
+test51() {
+ local off jobs=16
+ local -a opts
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 8 || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ prep_write
+
+ off=$((first_sequential_zone_sector * 512 - 8 * zone_size))
+ opts+=("--size=$((16 * zone_size))" "$(ioengine "libaio")")
+ opts+=("--zonemode=zbd" "--direct=1" "--zonesize=${zone_size}")
+ opts+=("--max_open_zones=2" "--offset=$off")
+ opts+=("--thread=1" "--group_reporting=1")
+ opts+=("--time_based" "--runtime=30" "--rw=randwrite")
+ for ((i=0;i<jobs;i++)); do
+ opts+=("--name=job${i}" "--filename=$dev")
+ opts+=("--bs=$(((i+1)*16))K")
+ opts+=($(job_var_opts_exclude "--max_open_zones"))
+ done
+ run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Verify that zone_reset_threshold only takes logical blocks from seq
+# zones into account, and logical blocks of conv zones are not counted.
+test52() {
+ local off io_size
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 8 || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ reset_zone "${dev}" -1
+
+ # Total I/O size is 1/8 = 0.125 of the I/O range of conv + seq zones.
+ # Set zone_reset_threshold to 0.1. Since the threshold is smaller than
+ # 0.125, a zone reset count of zero is expected.
+ # On the other hand, half of the I/O range is covered by conv zones.
+ # If fio counted the conv zones for zone_reset_threshold, the ratio
+ # would exceed 0.5 and trigger zone resets.
+
+ off=$((first_sequential_zone_sector * 512 - 8 * zone_size))
+ io_size=$((zone_size * 16 / 8))
+ run_fio --name=job --filename=$dev --rw=randwrite --bs=$((zone_size/16))\
+ --size=$((zone_size * 16)) --softrandommap=1 \
+ --io_size=$((io_size)) "$(ioengine "psync")" --offset=$off \
+ --zonemode=zbd --direct=1 --zonesize=${zone_size} \
+ --zone_reset_threshold=.1 --zone_reset_frequency=1.0 \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+
+ check_written ${io_size} || return $?
+ check_reset_count -eq 0 || return $?
+}
+
+# Check both reads and writes are executed by random I/O to conventional zones.
+test53() {
+ local off capacity io read_b=0 written_b=0
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 4 || return $SKIP_TESTCASE
+
+ off=$((first_sequential_zone_sector * 512 - 4 * zone_size))
+ capacity=$(total_zone_capacity 4 $off $dev)
+ run_fio --name=job --filename=${dev} --rw=randrw --bs=64K \
+ --size=$((4 * zone_size)) "$(ioengine "psync")" --offset=${off}\
+ --zonemode=zbd --direct=1 --zonesize=${zone_size} \
+ ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+
+ written_b=$(fio_written <"${logfile}.${test_number}")
+ read_b=$(fio_read <"${logfile}.${test_number}")
+ io=$((written_b + read_b))
+ echo "Number of bytes read: $read_b" >>"${logfile}.${test_number}"
+ echo "Number of bytes written: $written_b" >>"${logfile}.${test_number}"
+ echo "Total number of bytes read and written: $io <> $capacity" \
+ >>"${logfile}.${test_number}"
+ if ((io==capacity && written_b != 0 && read_b != 0)); then
+ return 0
+ fi
+ return 1
+}
+
+# Test read/write mix with verify.
+test54() {
+ require_zbd || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ run_fio --name=job --filename=${dev} "$(ioengine "libaio")" \
+ --time_based=1 --runtime=30s --continue_on_error=0 \
+ --offset=$((first_sequential_zone_sector * 512)) \
+ --size=$((8*zone_size)) --direct=1 --iodepth=1 \
+ --rw=randrw:2 --rwmixwrite=25 --bsrange=4k-${zone_size} \
+ --zonemode=zbd --zonesize=${zone_size} \
+ --verify=crc32c --do_verify=1 --verify_backlog=2 \
+ --experimental_verify=1 \
+ --alloc-size=65536 --random_generator=tausworthe64 \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# test 'z' suffix parsing only
+test55() {
+ local bs
+ bs=$((logical_block_size))
+
+ require_zbd || return $SKIP_TESTCASE
+ # offset=1z + offset_increment=10z + size=2z
+ require_seq_zones 13 || return $SKIP_TESTCASE
+
+ run_fio --name=j \
+ --filename=${dev} \
+ --direct=1 \
+ "$(ioengine "psync")" \
+ --zonemode=zbd \
+ --zonesize=${zone_size} \
+ --rw=write \
+ --bs=${bs} \
+ --numjobs=2 \
+ --offset_increment=10z \
+ --offset=1z \
+ --size=2z \
+ --io_size=3z \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# test 'z' suffix parsing only
+test56() {
+ local bs
+ bs=$((logical_block_size))
+
+ require_regular_block_dev || return $SKIP_TESTCASE
+ require_seq_zones 10 || return $SKIP_TESTCASE
+
+ run_fio --name=j \
+ --filename=${dev} \
+ --direct=1 \
+ "$(ioengine "psync")" \
+ --zonemode=strided \
+ --zonesize=${zone_size} \
+ --rw=write \
+ --bs=${bs} \
+ --size=10z \
+ --zoneskip=2z \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Test that repeated async write job does not cause zone reset during writes
+# in-flight, when the block size is not a divisor of the zone size.
+test57() {
+ local bs off
+
+ require_zbd || return $SKIP_TESTCASE
+
+ bs=$((4096 * 7))
+ off=$((first_sequential_zone_sector * 512))
+
+ run_fio --name=job --filename="${dev}" --rw=randwrite --bs="${bs}" \
+ --offset="${off}" --size=$((4 * zone_size)) --iodepth=256 \
+ "$(ioengine "libaio")" --time_based=1 --runtime=30s \
+ --zonemode=zbd --direct=1 --zonesize="${zone_size}" \
+ ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+SECONDS=0
tests=()
dynamic_analyzer=()
reset_all_zones=
use_libzbc=
zbd_debug=
max_open_zones_opt=
+quit_on_err=
while [ "${1#-}" != "$1" ]; do
case "$1" in
@@ -968,8 +1237,10 @@ while [ "${1#-}" != "$1" ]; do
-o) max_open_zones_opt="${2}"; shift; shift;;
-v) dynamic_analyzer=(valgrind "--read-var-info=yes");
shift;;
+ -q) quit_on_err=1; shift;;
-z) zbd_debug=1; shift;;
--) shift; break;;
+ *) usage; exit 1;;
esac
done
@@ -1073,6 +1344,7 @@ fi
if [[ -n ${max_open_zones_opt} ]]; then
# Override max_open_zones with the script option value
max_open_zones="${max_open_zones_opt}"
+ global_var_opts+=("--ignore_zone_limits=1")
job_var_opts+=("--max_open_zones=${max_open_zones_opt}")
fi
@@ -1087,10 +1359,12 @@ fi
logfile=$0.log
passed=0
+skipped=0
failed=0
if [ -t 1 ]; then
red="\e[1;31m"
green="\e[1;32m"
+ cyan="\e[1;36m"
end="\e[m"
else
red=""
@@ -1101,14 +1375,23 @@ rc=0
intr=0
trap 'intr=1' SIGINT
+ret=0
for test_number in "${tests[@]}"; do
rm -f "${logfile}.${test_number}"
+ unset SKIP_REASON
echo -n "Running test $(printf "%02d" $test_number) ... "
- if eval "test$test_number" && check_log $test_number; then
+ eval "test$test_number"
+ ret=$?
+ if ((!ret)) && check_log $test_number; then
status="PASS"
cc_status="${green}${status}${end}"
((passed++))
+ elif ((ret==SKIP_TESTCASE)); then
+ status="SKIP"
+ echo "${SKIP_REASON}" >> "${logfile}.${test_number}"
+ cc_status="${cyan}${status}${end} ${SKIP_REASON}"
+ ((skipped++))
else
status="FAIL"
cc_status="${red}${status}${end}"
@@ -1118,10 +1401,15 @@ for test_number in "${tests[@]}"; do
echo -e "$cc_status"
echo "$status" >> "${logfile}.${test_number}"
[ $intr -ne 0 ] && exit 1
+ [ -n "$quit_on_err" -a "$rc" -ne 0 ] && exit 1
done
echo "$passed tests passed"
+if [ $skipped -gt 0 ]; then
+ echo " $skipped tests skipped"
+fi
if [ $failed -gt 0 ]; then
- echo " and $failed tests failed"
+ echo " $failed tests failed"
fi
+echo "Run time: $(TZ=UTC0 printf "%(%H:%M:%S)T\n" $(( SECONDS )) )"
exit $rc
diff --git a/td_error.c b/td_error.c
index 9d58a314..13408f2e 100644
--- a/td_error.c
+++ b/td_error.c
@@ -20,7 +20,7 @@ int td_non_fatal_error(struct thread_data *td, enum error_type_bit etype,
if (!td->o.ignore_error[etype]) {
td->o.ignore_error[etype] = __NON_FATAL_ERR;
- td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR);
+ td->o.ignore_error_nr[etype] = FIO_ARRAY_SIZE(__NON_FATAL_ERR);
}
if (!(td->o.continue_on_error & (1 << etype)))
diff --git a/thread_options.h b/thread_options.h
index 97c400fe..4b4ecfe1 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -31,6 +31,14 @@ enum fio_memtype {
MEM_CUDA_MALLOC,/* use GPU memory */
};
+/*
+ * What mode to use for deduped data generation
+ */
+enum dedupe_mode {
+ DEDUPE_MODE_REPEAT = 0,
+ DEDUPE_MODE_WORKING_SET = 1,
+};
+
#define ERROR_STR_MAX 128
#define BSSPLIT_MAX 64
@@ -83,13 +91,16 @@ struct thread_options {
unsigned long long size;
unsigned long long io_size;
unsigned int size_percent;
+ unsigned int size_nz;
unsigned int io_size_percent;
+ unsigned int io_size_nz;
unsigned int fill_device;
unsigned int file_append;
unsigned long long file_size_low;
unsigned long long file_size_high;
unsigned long long start_offset;
unsigned long long start_offset_align;
+ unsigned int start_offset_nz;
unsigned long long bs[DDIR_RWDIR_CNT];
unsigned long long ba[DDIR_RWDIR_CNT];
@@ -166,6 +177,7 @@ struct thread_options {
fio_fp64_t zipf_theta;
fio_fp64_t pareto_h;
fio_fp64_t gauss_dev;
+ fio_fp64_t random_center;
unsigned int random_generator;
@@ -176,6 +188,7 @@ struct thread_options {
unsigned int thinktime;
unsigned int thinktime_spin;
unsigned int thinktime_blocks;
+ unsigned int thinktime_blocks_type;
unsigned int fsync_blocks;
unsigned int fdatasync_blocks;
unsigned int barrier_blocks;
@@ -196,12 +209,13 @@ struct thread_options {
unsigned long long zone_size;
unsigned long long zone_capacity;
unsigned long long zone_skip;
+ uint32_t zone_skip_nz;
enum fio_zone_mode zone_mode;
unsigned long long lockmem;
enum fio_memtype mem_type;
unsigned int mem_align;
- unsigned long long max_latency;
+ unsigned long long max_latency[DDIR_RWDIR_CNT];
unsigned int exit_what;
unsigned int stonewall;
@@ -237,6 +251,8 @@ struct thread_options {
unsigned int compress_percentage;
unsigned int compress_chunk;
unsigned int dedupe_percentage;
+ unsigned int dedupe_mode;
+ unsigned int dedupe_working_set_percentage;
unsigned int time_based;
unsigned int disable_lat;
unsigned int disable_clat;
@@ -313,6 +329,7 @@ struct thread_options {
unsigned int gid;
unsigned int offset_increment_percent;
+ unsigned int offset_increment_nz;
unsigned long long offset_increment;
unsigned long long number_ios;
@@ -348,6 +365,7 @@ struct thread_options {
unsigned int read_beyond_wp;
int max_open_zones;
unsigned int job_max_open_zones;
+ unsigned int ignore_zone_limits;
fio_fp64_t zrt;
fio_fp64_t zrf;
};
@@ -382,14 +400,19 @@ struct thread_options_pack {
uint64_t size;
uint64_t io_size;
uint32_t size_percent;
+ uint32_t size_nz;
uint32_t io_size_percent;
+ uint32_t io_size_nz;
uint32_t fill_device;
uint32_t file_append;
uint32_t unique_filename;
+ uint32_t pad3;
uint64_t file_size_low;
uint64_t file_size_high;
uint64_t start_offset;
uint64_t start_offset_align;
+ uint32_t start_offset_nz;
+ uint32_t pad4;
uint64_t bs[DDIR_RWDIR_CNT];
uint64_t ba[DDIR_RWDIR_CNT];
@@ -462,11 +485,10 @@ struct thread_options_pack {
struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
uint32_t zone_split_nr[DDIR_RWDIR_CNT];
- uint8_t pad1[4];
-
fio_fp64_t zipf_theta;
fio_fp64_t pareto_h;
fio_fp64_t gauss_dev;
+ fio_fp64_t random_center;
uint32_t random_generator;
@@ -477,6 +499,7 @@ struct thread_options_pack {
uint32_t thinktime;
uint32_t thinktime_spin;
uint32_t thinktime_blocks;
+ uint32_t thinktime_blocks_type;
uint32_t fsync_blocks;
uint32_t fdatasync_blocks;
uint32_t barrier_blocks;
@@ -497,6 +520,7 @@ struct thread_options_pack {
uint64_t zone_capacity;
uint64_t zone_skip;
uint64_t lockmem;
+ uint32_t zone_skip_nz;
uint32_t mem_type;
uint32_t mem_align;
@@ -504,6 +528,7 @@ struct thread_options_pack {
uint32_t stonewall;
uint32_t new_group;
uint32_t numjobs;
+
/*
* We currently can't convert these, so don't enable them
*/
@@ -534,6 +559,8 @@ struct thread_options_pack {
uint32_t compress_percentage;
uint32_t compress_chunk;
uint32_t dedupe_percentage;
+ uint32_t dedupe_mode;
+ uint32_t dedupe_working_set_percentage;
uint32_t time_based;
uint32_t disable_lat;
uint32_t disable_clat;
@@ -609,12 +636,14 @@ struct thread_options_pack {
uint32_t gid;
uint32_t offset_increment_percent;
+ uint32_t offset_increment_nz;
uint64_t offset_increment;
uint64_t number_ios;
uint64_t latency_target;
uint64_t latency_window;
- uint64_t max_latency;
+ uint64_t max_latency[DDIR_RWDIR_CNT];
+ uint32_t pad5;
fio_fp64_t latency_percentile;
uint32_t latency_run;
@@ -640,6 +669,8 @@ struct thread_options_pack {
uint32_t allow_mounted_write;
uint32_t zone_mode;
+ int32_t max_open_zones;
+ uint32_t ignore_zone_limits;
} __attribute__((packed));
extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);
diff --git a/tools/fiograph/fiograph.conf b/tools/fiograph/fiograph.conf
new file mode 100644
index 00000000..5becc4d9
--- /dev/null
+++ b/tools/fiograph/fiograph.conf
@@ -0,0 +1,105 @@
+[fio_jobs]
+header=<<B><font color="{}"> {} </font></B> >
+header_color=black
+text_color=darkgreen
+shape=box
+shape_color=blue
+style=rounded
+title_style=<<table border='0' cellborder='0' cellspacing='1'> <tr> <td align='center'> <b> {} </b> </td> </tr>
+item_style=<tr> <td align = "left"> <font color="{}" > {} </font> </td> </tr>
+cluster_style=filled
+cluster_color=gainsboro
+
+[exec_prerun]
+text_color=red
+
+[exec_postrun]
+text_color=red
+
+[numjobs]
+text_color=red
+style=<font color="{}" > x {} </font>
+
+[ioengine]
+text_color=darkblue
+specific_options_color=darkblue
+
+# definitions of engine's specific options
+
+[ioengine_cpuio]
+specific_options=cpuload cpumode cpuchunks exit_on_io_done
+
+[ioengine_dfs]
+specific_options=pool cont chunk_size object_class svcl
+
+[ioengine_e4defrag]
+specific_options=donorname inplace
+
+[ioengine_exec]
+specific_options=program arguments grace_time std_redirect
+
+[ioengine_filestat]
+specific_options=stat_type
+
+[ioengine_single-instance]
+specific_options=volume brick
+
+[ioengine_http]
+specific_options=https http_host http_user http_pass http_s3_key http_s3_keyid http_swift_auth_token http_s3_region http_mode http_verbose
+
+[ioengine_ime_aio]
+specific_options=ime_psync ime_psyncv
+
+[ioengine_io_uring]
+specific_options=hipri cmdprio_percentage fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored uncached nowait force_async
+
+[ioengine_libaio]
+specific_options=userspace_reap cmdprio_percentage nowait
+
+[ioengine_libcufile]
+specific_options=gpu_dev_ids cuda_io
+
+[ioengine_libhdfs]
+specific_options=namenode hostname port hdfsdirectory chunk_size single_instance hdfs_use_direct
+
+[ioengine_libiscsi]
+specific_options=initiator
+
+[ioengine_librpma_apm_server]
+specific_options=librpma_apm_client
+
+[ioengine_busy_wait_polling]
+specific_options=serverip port direct_write_to_pmem
+
+[ioengine_librpma_gpspm_server]
+specific_options=librpma_gpspm_client
+
+[ioengine_mmap]
+specific_options=thp
+
+[ioengine_mtd]
+specific_options=skip_bad
+
+[ioengine_nbd]
+specific_options=uri
+
+[ioengine_net]
+specific_options=hostname port protocol nodelay listen pingpong interface ttl window_size mss netsplice
+
+[ioengine_nfs]
+specific_options=nfs_url
+
+[ioengine_rados]
+specific_options=clustername pool clientname busy_poll touch_objects
+
+[ioengine_rbd]
+specific_options=clustername rbdname pool clientname busy_poll
+
+[ioengine_rdma]
+specific_options=hostname bindname port verb
+
+[ioengine_sg]
+specific_options=hipri readfua writefua sg_write_mode sg
+
+[ioengine_pvsync2]
+specific_options=hipri hipri_percentage uncached nowait sync psync vsync pvsync
diff --git a/tools/fiograph/fiograph.py b/tools/fiograph/fiograph.py
new file mode 100755
index 00000000..7695c964
--- /dev/null
+++ b/tools/fiograph/fiograph.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python3
+from graphviz import Digraph
+import argparse
+import configparser
+import os
+
+config_file = None
+fio_file = None
+
+
+def get_section_option(section_name, option_name, default=None):
+ global fio_file
+ if fio_file.has_option(section_name, option_name):
+ return fio_file[section_name][option_name]
+ return default
+
+
+def get_config_option(section_name, option_name, default=None):
+ global config_file
+ if config_file.has_option(section_name, option_name):
+ return config_file[section_name][option_name]
+ return default
+
+
+def get_header_color(keyword='fio_jobs', default_color='black'):
+ return get_config_option(keyword, 'header_color', default_color)
+
+
+def get_shape_color(keyword='fio_jobs', default_color='black'):
+ return get_config_option(keyword, 'shape_color', default_color)
+
+
+def get_text_color(keyword='fio_jobs', default_color='black'):
+ return get_config_option(keyword, 'text_color', default_color)
+
+
+def get_cluster_color(keyword='fio_jobs', default_color='gray92'):
+ return get_config_option(keyword, 'cluster_color', default_color)
+
+
+def get_header(keyword='fio_jobs'):
+ return get_config_option(keyword, 'header')
+
+
+def get_shape(keyword='fio_jobs'):
+ return get_config_option(keyword, 'shape', 'box')
+
+
+def get_style(keyword='fio_jobs'):
+ return get_config_option(keyword, 'style', 'rounded')
+
+
+def get_cluster_style(keyword='fio_jobs'):
+ return get_config_option(keyword, 'cluster_style', 'filled')
+
+
+def get_specific_options(engine):
+ if not engine:
+ return ''
+ return get_config_option('ioengine_{}'.format(engine), 'specific_options', '').split(' ')
+
+
+def render_option(section, label, display, option, color_override=None):
+ # These options are already shown with graphical helpers, no need to report them directly
+ skip_list = ['size', 'stonewall', 'runtime', 'time_based',
+ 'numjobs', 'wait_for', 'wait_for_previous']
+ # If the option doesn't exist or if a special handling is already done
+ # don't render it, just return the current state
+ if option in skip_list or option not in section:
+ return label, display
+ display = option
+ if section[option]:
+ display = '{} = {}'.format(display, section[option])
+
+ # Adding jobs's options into the box, darkgreen is the default color
+ if color_override:
+ color = color_override
+ else:
+ color = get_text_color(option, get_text_color('fio_jobs', 'darkgreen'))
+ label += get_config_option('fio_jobs',
+ 'item_style').format(color, display)
+ return label, display
+
+
+def render_options(fio_file, section_name):
+ """Render all options of a section."""
+ display = section_name
+ section = fio_file[section_name]
+
+ # Add a multiplier to the section_name if numjobs is set
+ numjobs = int(get_section_option(section_name, 'numjobs', '1'))
+ if numjobs > 1:
+ display = display + \
+ get_style('numjobs').format(
+ get_text_color('numjobs'), numjobs)
+
+ # Header of the box
+ label = get_config_option('fio_jobs', 'title_style').format(display)
+
+ # Let's parse all the options of the current fio thread
+ # Some needs to be printed on top or bottom of the job to ease the read
+ to_early_print = ['exec_prerun', 'ioengine']
+ to_late_print = ['exec_postrun']
+
+ # Let's print the options on top of the box
+ for early_print in to_early_print:
+ label, display = render_option(
+ section, label, display, early_print)
+
+ current_io_engine = get_section_option(
+ section_name, 'ioengine', None)
+ if current_io_engine:
+ # Let's print all specifics options for this engine
+ for specific_option in sorted(get_specific_options(current_io_engine)):
+ label, display = render_option(
+ section, label, display, specific_option, get_config_option('ioengine', 'specific_options_color'))
+
+ # Let's print generic options sorted by name
+ for option in sorted(section):
+ if option in to_early_print or option in to_late_print or option in get_specific_options(current_io_engine):
+ continue
+ label, display = render_option(section, label, display, option)
+
+ # let's print options on the bottom of the box
+ for late_print in to_late_print:
+ label, display = render_option(
+ section, label, display, late_print)
+
+ # End of the box content
+ label += '</table>>'
+ return label
+
+
+def render_section(current_graph, fio_file, section_name, label):
+ """Render the section."""
+ attr = None
+ section = fio_file[section_name]
+
+ # Let's render the box associated to a job
+ current_graph.node(section_name, label,
+ shape=get_shape(),
+ color=get_shape_color(),
+ style=get_style())
+
+ # Let's report the duration of the jobs with a self-loop arrow
+ if 'runtime' in section and 'time_based' in section:
+ attr = 'runtime={}'.format(section['runtime'])
+ elif 'size' in section:
+ attr = 'size={}'.format(section['size'])
+ if attr:
+ current_graph.edge(section_name, section_name, attr)
+
+
+def create_sub_graph(name):
+ """Return a new graph."""
+    # We need to put 'cluster' in the name to ensure graphviz considers it as a cluster
+ cluster_name = 'cluster_' + name
+ # Unset the main graph labels to avoid a recopy in each subgraph
+ attr = {}
+ attr['label'] = ''
+ new_graph = Digraph(name=cluster_name, graph_attr=attr)
+ new_graph.attr(style=get_cluster_style(),
+ color=get_cluster_color())
+ return new_graph
+
+
+def create_legend():
+ """Return a legend."""
+ html_table = "<<table border='0' cellborder='1' cellspacing='0' cellpadding='4'>"
+ html_table += '<tr><td COLSPAN="2"><b>Legend</b></td></tr>'
+ legend_item = '<tr> <td>{}</td> <td><font color="{}">{}</font></td></tr>"'
+ legend_bgcolor_item = '<tr><td>{}</td><td BGCOLOR="{}"></td></tr>'
+ html_table += legend_item.format('numjobs',
+ get_text_color('numjobs'), 'x numjobs')
+ html_table += legend_item.format('generic option',
+ get_text_color(), 'generic option')
+ html_table += legend_item.format('ioengine option',
+ get_text_color('ioengine'), 'ioengine option')
+ html_table += legend_bgcolor_item.format('job', get_shape_color())
+ html_table += legend_bgcolor_item.format(
+ 'execution group', get_cluster_color())
+ html_table += '</table>>'
+ legend = Digraph('html_table')
+ legend.node('legend', shape='none', label=html_table)
+ return legend
+
+
+def fio_to_graphviz(filename, format):
+ """Compute the graphviz graph from the fio file."""
+
+ # Let's read the fio file
+ global fio_file
+ fio_file = configparser.RawConfigParser(
+ allow_no_value=True,
+ default_section="global",
+ inline_comment_prefixes="'#', ';'")
+ fio_file.read(filename)
+
+ # Prepare the main graph object
+ # Let's define the header of the document
+ attrs = {}
+ attrs['labelloc'] = 't'
+ attrs['label'] = get_header().format(
+ get_header_color(), os.path.basename(filename))
+ main_graph = Digraph(engine='dot', graph_attr=attrs, format=format)
+
+ # Let's add a legend
+ main_graph.subgraph(create_legend())
+
+ # By default all jobs are run in parallel and depends on "global"
+ depends_on = fio_file.default_section
+
+ # The previous section is by default the global section
+ previous_section = fio_file.default_section
+
+ current_graph = main_graph
+
+ # The first job will be a new execution group
+ new_execution_group = True
+
+    # Let's iterate over all sections to create links between them
+ for section_name in fio_file.sections():
+ # The current section
+ section = fio_file[section_name]
+
+        # If the current section waits for the previous job to finish
+ if ('stonewall' or 'wait_for_previous') in section:
+ # let's remember what was the previous job we depend on
+ depends_on = previous_section
+ new_execution_group = True
+ elif 'wait_for' in section:
+ # This sections depends on a named section pointed by wait_for
+ depends_on = section['wait_for']
+ new_execution_group = True
+
+ if new_execution_group:
+ # Let's link the current graph with the main one
+ main_graph.subgraph(current_graph)
+ # Let's create a new graph to represent all the incoming jobs running at the same time
+ current_graph = create_sub_graph(section_name)
+
+ # Let's render the current section in its execution group
+ render_section(current_graph, fio_file, section_name,
+ render_options(fio_file, section_name))
+
+ # Let's trace the link between this job and the one it depends on
+        # If we depend on 'global', we can avoid adding an arrow as we don't want to see 'global'
+ if depends_on != fio_file.default_section:
+ current_graph.edge(depends_on, section_name)
+
+ # The current section become the parent of the next one
+ previous_section = section_name
+
+ # We are by default in the same execution group
+ new_execution_group = False
+
+ # The last subgraph isn't rendered yet
+ main_graph.subgraph(current_graph)
+
+ # Let's return the main graphviz object
+ return main_graph
+
+
+def setup_commandline():
+ "Prepare the command line."
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--file', action='store',
+ type=str,
+ required=True,
+ help='the fio file to graph')
+ parser.add_argument('--output', action='store',
+ type=str,
+ help='the output filename')
+ parser.add_argument('--format', action='store',
+ type=str,
+ default='png',
+ help='the output format')
+ parser.add_argument('--view', action='store_true',
+ default=False,
+ help='view the graph')
+ parser.add_argument('--keep', action='store_true',
+ default=False,
+ help='keep the graphviz script file')
+ parser.add_argument('--config', action='store',
+ type=str,
+ default='fiograph.conf',
+ help='the configuration filename')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ global config_file
+ args = setup_commandline()
+ output_file = args.file
+ if args.output is None:
+ output_file = output_file.replace('.fio', '')
+ config_file = configparser.RawConfigParser(allow_no_value=True)
+ config_file.read(args.config)
+ fio_to_graphviz(args.file, args.format).render(output_file, view=args.view)
+ if not args.keep:
+ os.remove(output_file)
+
+
+main()
diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot
index 78ee82fb..d2dc81df 100755
--- a/tools/plot/fio2gnuplot
+++ b/tools/plot/fio2gnuplot
@@ -198,7 +198,7 @@ def compute_temp_file(fio_data_file,disk_perf,gnuplot_output_dir, min_time, max_
# Index will be used to remember what file was featuring what value
index=index+1
- time, perf, x, block_size = line[1]
+ time, perf, x, block_size = line[1][:4]
if (blk_size == 0):
try:
blk_size=int(block_size)
diff --git a/unittests/lib/num2str.c b/unittests/lib/num2str.c
index a3492a8d..8f12cf83 100644
--- a/unittests/lib/num2str.c
+++ b/unittests/lib/num2str.c
@@ -29,7 +29,7 @@ static void test_num2str(void)
char *str;
int i;
- for (i = 0; i < ARRAY_SIZE(testcases); ++i) {
+ for (i = 0; i < FIO_ARRAY_SIZE(testcases); ++i) {
p = &testcases[i];
str = num2str(p->num, p->maxlen, p->base, p->pow2, p->unit);
CU_ASSERT_STRING_EQUAL(str, p->expected);
diff --git a/verify.c b/verify.c
index a418c054..0e1e4639 100644
--- a/verify.c
+++ b/verify.c
@@ -1411,7 +1411,6 @@ static void *verify_async_thread(void *data)
ret = pthread_cond_wait(&td->verify_cond,
&td->io_u_lock);
if (ret) {
- pthread_mutex_unlock(&td->io_u_lock);
break;
}
}
diff --git a/zbd.c b/zbd.c
index 905c0c2b..43f12b45 100644
--- a/zbd.c
+++ b/zbd.c
@@ -32,6 +32,17 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
{
int ret;
+ if (f->filetype == FIO_TYPE_PIPE) {
+ log_err("zonemode=zbd does not support pipes\n");
+ return -EINVAL;
+ }
+
+ /* If regular file, always emulate zones inside the file. */
+ if (f->filetype == FIO_TYPE_FILE) {
+ *model = ZBD_NONE;
+ return 0;
+ }
+
if (td->io_ops && td->io_ops->get_zoned_model)
ret = td->io_ops->get_zoned_model(td, f, model);
else
@@ -114,6 +125,34 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
}
/**
+ * zbd_get_max_open_zones - Get the maximum number of open zones
+ * @td: FIO thread data
+ * @f: FIO file for which to get max open zones
+ * @max_open_zones: Upon success, result will be stored here.
+ *
+ * A @max_open_zones value set to zero means no limit.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ */
int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
			   unsigned int *max_open_zones)
{
	int ret;

	/*
	 * Prefer the ioengine-specific hook when the engine provides one;
	 * otherwise fall back to the generic block-layer implementation.
	 */
	if (td->io_ops && td->io_ops->get_max_open_zones)
		ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
	else
		ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
	if (ret < 0) {
		/*
		 * NOTE(review): the error is reported via errno, but the
		 * callbacks above signal failure through a negative return
		 * value and may not set errno — confirm errno is meaningful
		 * here, or consider reporting -ret instead.
		 */
		td_verror(td, errno, "get max open zones failed");
		log_err("%s: get max open zones failed (%d).\n",
			f->file_name, errno);
	}

	return ret;
}
+
+/**
* zbd_zone_idx - convert an offset into a zone number
* @f: file pointer.
* @offset: offset in bytes. If this offset is in the first zone_size bytes
@@ -132,15 +171,6 @@ static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset)
}
/**
- * zbd_zone_swr - Test whether a zone requires sequential writes
- * @z: zone info pointer.
- */
-static inline bool zbd_zone_swr(struct fio_zone_info *z)
-{
- return z->type == ZBD_ZONE_TYPE_SWR;
-}
-
-/**
* zbd_zone_end - Return zone end location
* @z: zone info pointer.
*/
@@ -171,11 +201,12 @@ static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
{
assert((required & 511) == 0);
- return zbd_zone_swr(z) &&
+ return z->has_wp &&
z->wp + required > zbd_zone_capacity_end(z);
}
-static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zone_info *z)
+static void zone_lock(struct thread_data *td, const struct fio_file *f,
+ struct fio_zone_info *z)
{
struct zoned_block_device_info *zbd = f->zbd_info;
uint32_t nz = z - zbd->zone_info;
@@ -183,6 +214,8 @@ static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zon
/* A thread should never lock zones outside its working area. */
assert(f->min_zone <= nz && nz < f->max_zone);
+ assert(z->has_wp);
+
/*
* Lock the io_u target zone. The zone will be unlocked if io_u offset
* is changed or when io_u completes and zbd_put_io() executed.
@@ -199,11 +232,26 @@ static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zon
}
}
+static inline void zone_unlock(struct fio_zone_info *z)
+{
+ int ret;
+
+ assert(z->has_wp);
+ ret = pthread_mutex_unlock(&z->mutex);
+ assert(!ret);
+}
+
static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
{
return (uint64_t)(offset - f->file_offset) < f->io_size;
}
+static inline struct fio_zone_info *get_zone(const struct fio_file *f,
+ unsigned int zone_nr)
+{
+ return &f->zbd_info->zone_info[zone_nr];
+}
+
/* Verify whether direct I/O is used for all host-managed zoned drives. */
static bool zbd_using_direct_io(void)
{
@@ -235,7 +283,7 @@ static bool zbd_is_seq_job(struct fio_file *f)
zone_idx_b = zbd_zone_idx(f, f->file_offset);
zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1);
for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
- if (zbd_zone_swr(&f->zbd_info->zone_info[zone_idx]))
+ if (get_zone(f, zone_idx)->has_wp)
return true;
return false;
@@ -276,9 +324,7 @@ static bool zbd_verify_sizes(void)
return false;
}
- if (td->o.zone_skip &&
- (td->o.zone_skip < td->o.zone_size ||
- td->o.zone_skip % td->o.zone_size)) {
+ if (td->o.zone_skip % td->o.zone_size) {
log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
f->file_name, (unsigned long long) td->o.zone_skip,
(unsigned long long) td->o.zone_size);
@@ -286,7 +332,7 @@ static bool zbd_verify_sizes(void)
}
zone_idx = zbd_zone_idx(f, f->file_offset);
- z = &f->zbd_info->zone_info[zone_idx];
+ z = get_zone(f, zone_idx);
if ((f->file_offset != z->start) &&
(td->o.td_ddir != TD_DDIR_READ)) {
new_offset = zbd_zone_end(z);
@@ -302,7 +348,7 @@ static bool zbd_verify_sizes(void)
f->file_offset = new_offset;
}
zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size);
- z = &f->zbd_info->zone_info[zone_idx];
+ z = get_zone(f, zone_idx);
new_end = z->start;
if ((td->o.td_ddir != TD_DDIR_READ) &&
(f->file_offset + f->io_size != new_end)) {
@@ -316,10 +362,6 @@ static bool zbd_verify_sizes(void)
(unsigned long long) new_end - f->file_offset);
f->io_size = new_end - f->file_offset;
}
-
- f->min_zone = zbd_zone_idx(f, f->file_offset);
- f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
- assert(f->min_zone < f->max_zone);
}
}
@@ -330,20 +372,21 @@ static bool zbd_verify_bs(void)
{
struct thread_data *td;
struct fio_file *f;
- uint32_t zone_size;
int i, j, k;
for_each_td(td, i) {
for_each_file(td, f, j) {
+ uint64_t zone_size;
+
if (!f->zbd_info)
continue;
zone_size = f->zbd_info->zone_size;
- for (k = 0; k < ARRAY_SIZE(td->o.bs); k++) {
+ for (k = 0; k < FIO_ARRAY_SIZE(td->o.bs); k++) {
if (td->o.verify != VERIFY_NONE &&
zone_size % td->o.bs[k] != 0) {
- log_info("%s: block size %llu is not a divisor of the zone size %d\n",
+ log_info("%s: block size %llu is not a divisor of the zone size %llu\n",
f->file_name, td->o.bs[k],
- zone_size);
+ (unsigned long long)zone_size);
return false;
}
}
@@ -377,7 +420,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
int i;
if (zone_size == 0) {
- log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n",
+ log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n",
f->file_name);
return 1;
}
@@ -398,6 +441,12 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
return 1;
}
+ if (f->real_file_size < zone_size) {
+ log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n",
+ f->file_name, f->real_file_size, zone_size);
+ return -EINVAL;
+ }
+
nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
zbd_info = scalloc(1, sizeof(*zbd_info) +
(nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
@@ -415,6 +464,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
p->type = ZBD_ZONE_TYPE_SWR;
p->cond = ZBD_ZONE_COND_EMPTY;
p->capacity = zone_capacity;
+ p->has_wp = 1;
}
/* a sentinel */
p->start = nr_zones * zone_size;
@@ -443,7 +493,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
struct fio_zone_info *p;
uint64_t zone_size, offset;
struct zoned_block_device_info *zbd_info = NULL;
- int i, j, ret = 0;
+ int i, j, ret = -ENOMEM;
zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
if (!zones)
@@ -475,7 +525,6 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
zbd_info = scalloc(1, sizeof(*zbd_info) +
(nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
- ret = -ENOMEM;
if (!zbd_info)
goto out;
mutex_init_pshared(&zbd_info->mutex);
@@ -499,8 +548,17 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
p->wp = z->wp;
break;
}
+
+ switch (z->type) {
+ case ZBD_ZONE_TYPE_SWR:
+ p->has_wp = 1;
+ break;
+ default:
+ p->has_wp = 0;
+ }
p->type = z->type;
p->cond = z->cond;
+
if (j > 0 && p->start != p[-1].start + zone_size) {
log_info("%s: invalid zone data\n",
f->file_name);
@@ -512,8 +570,9 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
offset = z->start + z->len;
if (j >= nr_zones)
break;
- nrz = zbd_report_zones(td, f, offset,
- zones, ZBD_REPORT_MAX_ZONES);
+ nrz = zbd_report_zones(td, f, offset, zones,
+ min((uint32_t)(nr_zones - j),
+ ZBD_REPORT_MAX_ZONES));
if (nrz < 0) {
ret = nrz;
log_info("fio: report zones (offset %llu) failed for %s (%d).\n",
@@ -540,6 +599,55 @@ out:
return ret;
}
/**
 * zbd_set_max_open_zones - Resolve and store the effective open zone limit
 * @td: FIO thread data
 * @f: FIO file whose zbd_info receives the resolved limit
 *
 * Reconciles the user-requested --max_open_zones with the limit reported by
 * the device (queried for host-managed devices only, unless zone limits are
 * ignored) and stores the result in f->zbd_info->max_open_zones. A resolved
 * limit of zero means no limit.
 *
 * Returns 0 on success and a negative error code on failure.
 */
static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f)
{
	struct zoned_block_device_info *zbd = f->zbd_info;
	unsigned int max_open_zones;
	int ret;

	if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) {
		/* Only host-managed devices have a max open limit */
		zbd->max_open_zones = td->o.max_open_zones;
		goto out;
	}

	/* If host-managed, get the max open limit */
	ret = zbd_get_max_open_zones(td, f, &max_open_zones);
	if (ret)
		return ret;

	if (!max_open_zones) {
		/* No device limit */
		zbd->max_open_zones = td->o.max_open_zones;
	} else if (!td->o.max_open_zones) {
		/* No user limit. Set limit to device limit */
		zbd->max_open_zones = max_open_zones;
	} else if (td->o.max_open_zones <= max_open_zones) {
		/* Both user limit and dev limit. User limit not too large */
		zbd->max_open_zones = td->o.max_open_zones;
	} else {
		/* Both user limit and dev limit. User limit too large */
		td_verror(td, EINVAL,
			  "Specified --max_open_zones is too large");
		log_err("Specified --max_open_zones (%d) is larger than max (%u)\n",
			td->o.max_open_zones, max_open_zones);
		return -EINVAL;
	}

out:
	/* Ensure that the limit is not larger than FIO's internal limit */
	if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
		td_verror(td, EINVAL, "'max_open_zones' value is too large");
		log_err("'max_open_zones' value is larger than %u\n", ZBD_MAX_OPEN_ZONES);
		return -EINVAL;
	}

	dprint(FD_ZBD, "%s: using max open zones limit: %"PRIu32"\n",
	       f->file_name, zbd->max_open_zones);

	return 0;
}
+
/*
* Allocate zone information and store it into f->zbd_info if zonemode=zbd.
*
@@ -557,14 +665,16 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
return ret;
switch (zbd_model) {
- case ZBD_IGNORE:
- return 0;
case ZBD_HOST_AWARE:
case ZBD_HOST_MANAGED:
ret = parse_zone_info(td, f);
+ if (ret)
+ return ret;
break;
case ZBD_NONE:
ret = init_zone_info(td, f);
+ if (ret)
+ return ret;
break;
default:
td_verror(td, EINVAL, "Unsupported zoned model");
@@ -572,11 +682,16 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
return -EINVAL;
}
- if (ret == 0) {
- f->zbd_info->model = zbd_model;
- f->zbd_info->max_open_zones = td->o.max_open_zones;
+ assert(f->zbd_info);
+ f->zbd_info->model = zbd_model;
+
+ ret = zbd_set_max_open_zones(td, f);
+ if (ret) {
+ zbd_free_zone_info(f);
+ return ret;
}
- return ret;
+
+ return 0;
}
void zbd_free_zone_info(struct fio_file *f)
@@ -633,7 +748,7 @@ static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
struct fio_zone_info *z);
-int zbd_setup_files(struct thread_data *td)
+int zbd_init_files(struct thread_data *td)
{
struct fio_file *f;
int i;
@@ -642,6 +757,44 @@ int zbd_setup_files(struct thread_data *td)
if (zbd_init_zone_info(td, f))
return 1;
}
+ return 0;
+}
+
+void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
+{
+ struct fio_file *f;
+ int i;
+
+ for_each_file(td, f, i) {
+ struct zoned_block_device_info *zbd = f->zbd_info;
+ // zonemode=strided doesn't get per-file zone size.
+ uint64_t zone_size = zbd ? zbd->zone_size : td->o.zone_size;
+
+ if (zone_size == 0)
+ continue;
+
+ if (td->o.size_nz > 0) {
+ td->o.size = td->o.size_nz * zone_size;
+ }
+ if (td->o.io_size_nz > 0) {
+ td->o.io_size = td->o.io_size_nz * zone_size;
+ }
+ if (td->o.start_offset_nz > 0) {
+ td->o.start_offset = td->o.start_offset_nz * zone_size;
+ }
+ if (td->o.offset_increment_nz > 0) {
+ td->o.offset_increment = td->o.offset_increment_nz * zone_size;
+ }
+ if (td->o.zone_skip_nz > 0) {
+ td->o.zone_skip = td->o.zone_skip_nz * zone_size;
+ }
+ }
+}
+
+int zbd_setup_files(struct thread_data *td)
+{
+ struct fio_file *f;
+ int i;
if (!zbd_using_direct_io()) {
log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
@@ -659,21 +812,44 @@ int zbd_setup_files(struct thread_data *td)
struct fio_zone_info *z;
int zi;
- if (!zbd)
- continue;
+ assert(zbd);
+
+ f->min_zone = zbd_zone_idx(f, f->file_offset);
+ f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
- zbd->max_open_zones = zbd->max_open_zones ?: ZBD_MAX_OPEN_ZONES;
+ /*
+ * When all zones in the I/O range are conventional, io_size
+ * can be smaller than zone size, making min_zone the same
+ * as max_zone. This is why the assert below needs to be made
+ * conditional.
+ */
+ if (zbd_is_seq_job(f))
+ assert(f->min_zone < f->max_zone);
if (td->o.max_open_zones > 0 &&
zbd->max_open_zones != td->o.max_open_zones) {
log_err("Different 'max_open_zones' values\n");
return 1;
}
- if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
- log_err("'max_open_zones' value is limited by %u\n", ZBD_MAX_OPEN_ZONES);
+
+ /*
+ * The per job max open zones limit cannot be used without a
+ * global max open zones limit. (As the tracking of open zones
+ * is disabled when there is no global max open zones limit.)
+ */
+ if (td->o.job_max_open_zones && !zbd->max_open_zones) {
+ log_err("'job_max_open_zones' cannot be used without a global open zones limit\n");
return 1;
}
+ /*
+ * zbd->max_open_zones is the global limit shared for all jobs
+ * that target the same zoned block device. Force sync the per
+ * thread global limit with the actual global limit. (The real
+ * per thread/job limit is stored in td->o.job_max_open_zones).
+ */
+ td->o.max_open_zones = zbd->max_open_zones;
+
for (zi = f->min_zone; zi < f->max_zone; zi++) {
z = &zbd->zone_info[zi];
if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
@@ -695,10 +871,10 @@ int zbd_setup_files(struct thread_data *td)
return 0;
}
-static unsigned int zbd_zone_nr(struct zoned_block_device_info *zbd_info,
- struct fio_zone_info *zone)
+static inline unsigned int zbd_zone_nr(const struct fio_file *f,
+ struct fio_zone_info *zone)
{
- return zone - zbd_info->zone_info;
+ return zone - f->zbd_info->zone_info;
}
/**
@@ -716,12 +892,16 @@ static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
{
uint64_t offset = z->start;
uint64_t length = (z+1)->start - offset;
+ uint64_t data_in_zone = z->wp - z->start;
int ret = 0;
+ if (!data_in_zone)
+ return 0;
+
assert(is_valid_offset(f, offset + length - 1));
dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name,
- zbd_zone_nr(f->zbd_info, z));
+ zbd_zone_nr(f, z));
switch (f->zbd_info->model) {
case ZBD_HOST_AWARE:
case ZBD_HOST_MANAGED:
@@ -734,7 +914,8 @@ static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
}
pthread_mutex_lock(&f->zbd_info->mutex);
- f->zbd_info->sectors_with_data -= z->wp - z->start;
+ f->zbd_info->sectors_with_data -= data_in_zone;
+ f->zbd_info->wp_sectors_with_data -= data_in_zone;
pthread_mutex_unlock(&f->zbd_info->mutex);
z->wp = z->start;
z->verify_block = 0;
@@ -754,11 +935,8 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
if (f->zbd_info->open_zones[open_zone_idx] == zone_idx)
break;
}
- if (open_zone_idx == f->zbd_info->num_open_zones) {
- dprint(FD_ZBD, "%s: zone %d is not open\n",
- f->file_name, zone_idx);
+ if (open_zone_idx == f->zbd_info->num_open_zones)
return;
- }
dprint(FD_ZBD, "%s: closing zone %d\n", f->file_name, zone_idx);
memmove(f->zbd_info->open_zones + open_zone_idx,
@@ -767,7 +945,7 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
sizeof(f->zbd_info->open_zones[0]));
f->zbd_info->num_open_zones--;
td->num_open_zones--;
- f->zbd_info->zone_info[zone_idx].open = 0;
+ get_zone(f, zone_idx)->open = 0;
}
/*
@@ -776,45 +954,35 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
* @f: fio file for which to reset zones
* @zb: first zone to reset.
* @ze: first zone not to reset.
- * @all_zones: whether to reset all zones or only those zones for which the
- * write pointer is not a multiple of td->o.min_bs[DDIR_WRITE].
*/
static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
struct fio_zone_info *const zb,
- struct fio_zone_info *const ze, bool all_zones)
+ struct fio_zone_info *const ze)
{
struct fio_zone_info *z;
const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
- bool reset_wp;
int res = 0;
assert(min_bs);
dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name,
- zbd_zone_nr(f->zbd_info, zb), zbd_zone_nr(f->zbd_info, ze));
+ zbd_zone_nr(f, zb), zbd_zone_nr(f, ze));
for (z = zb; z < ze; z++) {
- uint32_t nz = z - f->zbd_info->zone_info;
+ uint32_t nz = zbd_zone_nr(f, z);
- if (!zbd_zone_swr(z))
+ if (!z->has_wp)
continue;
zone_lock(td, f, z);
- if (all_zones) {
- pthread_mutex_lock(&f->zbd_info->mutex);
- zbd_close_zone(td, f, nz);
- pthread_mutex_unlock(&f->zbd_info->mutex);
-
- reset_wp = z->wp != z->start;
- } else {
- reset_wp = z->wp % min_bs != 0;
- }
- if (reset_wp) {
+ pthread_mutex_lock(&f->zbd_info->mutex);
+ zbd_close_zone(td, f, nz);