fio: add serialize_overlap option
author Sitsofe Wheeler <sitsofe@yahoo.com>
Sun, 23 Apr 2017 21:54:54 +0000 (22:54 +0100)
committer Sitsofe Wheeler <sitsofe@yahoo.com>
Mon, 14 Aug 2017 03:43:15 +0000 (04:43 +0100)
If this isn't set (the default), fio can submit write I/Os that overlap
other in-flight I/Os, leading to potential data races. For example, the
following job frequently fails at the verification stage:

./fio --random_distribution=zipf:1.6 --direct=1 --filename \
 /tmp/fiofile --ioengine=posixaio --iodepth=32 --size=20M --bs=4k \
 --rw=randwrite --verify=crc32c --name=verifyoverlap

When serialize_overlap=1 fio avoids creating such races.

Thanks to Rachel Lunnon (StorMagic) for helping me debug the initial
version of this!

Fixes: https://github.com/axboe/fio/issues/335

v2: Fix merge conflict and add missing conversion.
v3: Add man page, fix serialize_overlap disabling, improve commit
    message.

Tested-by: Jeff Furlong <jeff.furlong@wdc.com>
Signed-off-by: Sitsofe Wheeler <sitsofe@yahoo.com>
HOWTO
backend.c
cconv.c
fio.1
init.c
options.c
thread_options.h

diff --git a/HOWTO b/HOWTO
index fc173f0..8a7cb1a 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -2030,6 +2030,21 @@ I/O depth
        16 requests, it will let the depth drain down to 4 before starting to fill
        it again.
 
+.. option:: serialize_overlap=bool
+
+       Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+       When two or more I/Os are submitted simultaneously, there is no guarantee that
+       the I/Os will be processed or completed in the submitted order. Further, if
+       two or more of those I/Os are writes, any overlapping region between them can
+       become indeterminate/undefined on certain storage. These issues can cause
+       verification to fail erratically when at least one of the racing I/Os is
+       changing data and the overlapping region has a non-zero size. Setting
+       ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly
+       serializing in-flight I/Os that have a non-zero overlap. Note that setting
+       this option can reduce both performance and the :option:`iodepth` achieved.
+       Additionally this option does not work when :option:`io_submit_mode` is set to
+       offload. Default: false.
+
 .. option:: io_submit_mode=str
 
        This option controls how fio submits the I/O to the I/O engine. The default
@@ -2605,7 +2620,6 @@ Verification
 
        Enable experimental verification.
 
-
 Steady state
 ~~~~~~~~~~~~
 
diff --git a/backend.c b/backend.c
index fe15997..f003761 100644 (file)
--- a/backend.c
+++ b/backend.c
@@ -586,6 +586,37 @@ static int unlink_all_files(struct thread_data *td)
        return ret;
 }
 
+/*
+ * Check if io_u overlaps any IO_U_F_FLIGHT entry in q (half-open byte ranges)
+ */
+static bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u)
+{
+       bool overlap;
+       struct io_u *check_io_u;
+       unsigned long long x1, x2, y1, y2;
+       int i;
+
+       x1 = io_u->offset;
+       x2 = io_u->offset + io_u->buflen;       /* one past the last byte */
+       overlap = false;
+       io_u_qiter(q, check_io_u, i) {
+               if (check_io_u->flags & IO_U_F_FLIGHT) {
+                       y1 = check_io_u->offset;
+                       y2 = check_io_u->offset + check_io_u->buflen;
+
+                       if (x1 < y2 && y1 < x2) { /* strict: abutting is OK */
+                               overlap = true;
+                               dprint(FD_IO, "in-flight overlap: %llu/%lu, %llu/%lu\n",
+                                               x1, io_u->buflen,
+                                               y1, check_io_u->buflen);
+                               break;  /* one hit is enough */
+                       }
+               }
+       }
+
+       return overlap;
+}
+
 /*
  * The main verify engine. Runs over the writes we previously submitted,
  * reads the blocks back in, and checks the crc/md5 of the data.
@@ -716,7 +747,13 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes)
                if (!td->o.disable_slat)
                        fio_gettime(&io_u->start_time, NULL);
 
-               ret = td_io_queue(td, io_u);
+               if (td->o.serialize_overlap && td->cur_depth > 0) {
+                       if (in_flight_overlap(&td->io_u_all, io_u))
+                               ret = FIO_Q_BUSY;
+                       else
+                               ret = td_io_queue(td, io_u);
+               } else
+                       ret = td_io_queue(td, io_u);
 
                if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL))
                        break;
@@ -983,7 +1020,13 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
                                td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
 
                } else {
-                       ret = td_io_queue(td, io_u);
+                       if (td->o.serialize_overlap && td->cur_depth > 0) {
+                               if (in_flight_overlap(&td->io_u_all, io_u))
+                                       ret = FIO_Q_BUSY;
+                               else
+                                       ret = td_io_queue(td, io_u);
+                       } else
+                               ret = td_io_queue(td, io_u);
 
                        if (should_check_rate(td))
                                td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
diff --git a/cconv.c b/cconv.c
index f9f2b30..ac58705 100644 (file)
--- a/cconv.c
+++ b/cconv.c
@@ -96,6 +96,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->iodepth_batch = le32_to_cpu(top->iodepth_batch);
        o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min);
        o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max);
+       o->serialize_overlap = le32_to_cpu(top->serialize_overlap);
        o->size = le64_to_cpu(top->size);
        o->io_size = le64_to_cpu(top->io_size);
        o->size_percent = le32_to_cpu(top->size_percent);
@@ -346,6 +347,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->iodepth_batch = cpu_to_le32(o->iodepth_batch);
        top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min);
        top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max);
+       top->serialize_overlap = cpu_to_le32(o->serialize_overlap);
        top->size_percent = cpu_to_le32(o->size_percent);
        top->fill_device = cpu_to_le32(o->fill_device);
        top->file_append = cpu_to_le32(o->file_append);
diff --git a/fio.1 b/fio.1
index a3fba65..14359e6 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1044,6 +1044,20 @@ we simply do polling.
 Low watermark indicating when to start filling the queue again.  Default:
 \fBiodepth\fR.
 .TP
+.BI serialize_overlap \fR=\fPbool
+Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+When two or more I/Os are submitted simultaneously, there is no guarantee that
+the I/Os will be processed or completed in the submitted order. Further, if
+two or more of those I/Os are writes, any overlapping region between them can
+become indeterminate/undefined on certain storage. These issues can cause
+verification to fail erratically when at least one of the racing I/Os is
+changing data and the overlapping region has a non-zero size. Setting
+\fBserialize_overlap\fR tells fio to avoid provoking this behavior by explicitly
+serializing in-flight I/Os that have a non-zero overlap. Note that setting
+this option can reduce both performance and the \fBiodepth\fR achieved.
+Additionally this option does not work when \fBio_submit_mode\fR is set to
+offload. Default: false.
+.TP
 .BI io_submit_mode \fR=\fPstr
 This option controls how fio submits the IO to the IO engine. The default is
 \fBinline\fR, which means that the fio job threads submit and reap IO directly.
diff --git a/init.c b/init.c
index 42e7107..164e411 100644 (file)
--- a/init.c
+++ b/init.c
@@ -698,6 +698,23 @@ static int fixup_options(struct thread_data *td)
        if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max)
                o->iodepth_batch_complete_max = o->iodepth_batch_complete_min;
 
+       /*
+        * There's no need to check for in-flight overlapping IOs if the job
+        * isn't changing data or the maximum iodepth is guaranteed to be 1
+        */
+       if (o->serialize_overlap && !(td->flags & TD_F_READ_IOLOG) &&
+           (!(td_write(td) || td_trim(td)) || o->iodepth == 1))
+               o->serialize_overlap = 0;
+       /*
+        * Currently can't check for overlaps in offload mode
+        */
+       if (o->serialize_overlap && o->io_submit_mode == IO_MODE_OFFLOAD) {
+               log_err("fio: checking for in-flight overlaps when the "
+                       "io_submit_mode is offload is not supported\n");
+               o->serialize_overlap = 0;
+               ret = warnings_fatal;
+       }
+
        if (o->nr_files > td->files_index)
                o->nr_files = td->files_index;
 
diff --git a/options.c b/options.c
index f2b2bb9..443791a 100644 (file)
--- a/options.c
+++ b/options.c
@@ -1881,6 +1881,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_IO_BASIC,
        },
+       {
+               .name   = "serialize_overlap",
+               .lname  = "Serialize overlap",
+               .off1   = offsetof(struct thread_options, serialize_overlap),
+               .type   = FIO_OPT_BOOL,
+               .help   = "Wait for in-flight IOs that collide to complete",
+               .parent = "iodepth",
+               .def    = "0",
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_IO_BASIC,
+       },
        {
                .name   = "io_submit_mode",
                .lname  = "IO submit mode",
diff --git a/thread_options.h b/thread_options.h
index f3dfd42..26a3e0e 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -65,6 +65,7 @@ struct thread_options {
        unsigned int iodepth_batch;
        unsigned int iodepth_batch_complete_min;
        unsigned int iodepth_batch_complete_max;
+       unsigned int serialize_overlap;
 
        unsigned int unique_filename;
 
@@ -340,6 +341,8 @@ struct thread_options_pack {
        uint32_t iodepth_batch;
        uint32_t iodepth_batch_complete_min;
        uint32_t iodepth_batch_complete_max;
+       uint32_t serialize_overlap;
+       uint32_t pad3;
 
        uint64_t size;
        uint64_t io_size;