dedupe: allow to generate dedupe buffers from working set
author Bar David <Bar.David@dell.com>
Thu, 17 Jun 2021 12:39:58 +0000 (15:39 +0300)
committer Bar David <bardavvid@gmail.com>
Thu, 15 Jul 2021 05:55:15 +0000 (08:55 +0300)
This commit introduces a new dedupe generation mode, "working_set".
Working set mode simulates a more realistic approach to deduped data,
in which deduped buffers are generated from a pre-existing working set
sized as a percentage of the device or file.

In other words, a duplicate is not usually expected to be written close
in time to its source buffer, and the source buffers
usually make up only a small subset of the entire file or device.

Signed-off-by: Bar David <bardavvid@gmail.com>
16 files changed:
DEDUPE-TODO [new file with mode: 0644]
HOWTO
Makefile
cconv.c
dedupe.c [new file with mode: 0644]
dedupe.h [new file with mode: 0644]
fio.1
fio.h
init.c
io_u.c
lib/rand.c
lib/rand.h
options.c
server.h
t/dedupe.c
thread_options.h

diff --git a/DEDUPE-TODO b/DEDUPE-TODO
new file mode 100644 (file)
index 0000000..1f3ee9d
--- /dev/null
@@ -0,0 +1,19 @@
+- Mixed buffers of dedupe-able and compressible data.
+  A major use case in performance benchmarking of storage subsystems.
+
+- Shifted dedupe-able data.
+  Allow dedupe buffer generation to shift contents by a random number
+  of sectors (filling the gaps with incompressible data). Some storage
+  subsystems have modernized their deduplication detection algorithms to
+  look for shifted data as well. For example, some databases prepend a
+  timestamp to written blocks, which makes the underlying data
+  dedupe-able at a different alignment. FIO should be able to simulate
+  such a workload.
+
+- Generation of similar data (but not exact).
+  A rising trend in enterprise storage systems.
+  Generation of "similar" data means random incompressible buffers
+  that differ by a few (a configurable number of) bits from each other.
+  The storage subsystem usually identifies the similar buffers using
+  locality-sensitive hashing or other methods.
+
diff --git a/HOWTO b/HOWTO
index 86fb296445f006e2f3416abe559bd5f0dc57ce19..a12bccba826d5c966b438581286e79b43eb8cb6c 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1705,6 +1705,36 @@ Buffers and memory
        this option will also enable :option:`refill_buffers` to prevent every buffer
        being identical.
 
+.. option:: dedupe_mode=str
+
+       If ``dedupe_percentage=<int>`` is given, then this option controls how fio
+       generates the dedupe buffers.
+
+               **repeat**
+                       Generate dedupe buffers by repeating previous writes
+               **working_set**
+                       Generate dedupe buffers from working set
+
+       ``repeat`` is the default option for fio. Dedupe buffers are generated
+       by repeating the previous unique write.
+
+       ``working_set`` is a more realistic workload.
+       With ``working_set``, ``dedupe_working_set_percentage=<int>`` should be provided.
+       Given that, fio will use the initial unique write buffers as its working set.
+       Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+       Note that by using ``working_set`` the dedupe percentage will converge
+       to the desired value over time, while ``repeat`` maintains the desired
+       percentage throughout the job.
+
+.. option:: dedupe_working_set_percentage=int
+
+       If ``dedupe_mode=<str>`` is set to ``working_set``, then this controls
+       the percentage of the size of the file or device used as the pool of
+       buffers from which fio generates the dedupe buffers.
+
+       Note that :option:`size` needs to be explicitly provided and only 1 file
+       per job is supported.
+
 .. option:: invalidate=bool
 
        Invalidate the buffer/page cache parts of the files to be used prior to
index f57569d5f66461f85a54a69e53d4f161a2d6fba8..f9096217c7f8b00fae6a5bb2967fd8b2537936c1 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -61,7 +61,7 @@ SOURCE :=     $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
                gettime-thread.c helpers.c json.c idletime.c td_error.c \
                profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
                workqueue.c rate-submit.c optgroup.c helper_thread.c \
-               steadystate.c zone-dist.c zbd.c
+               steadystate.c zone-dist.c zbd.c dedupe.c
 
 ifdef CONFIG_LIBHDFS
   HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
diff --git a/cconv.c b/cconv.c
index 74c241063abb6491586f18751502ff572f50edc0..e3a8c27cf63205bb7eb4eba5c3c0fc7dd703249e 100644 (file)
--- a/cconv.c
+++ b/cconv.c
@@ -298,6 +298,8 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->compress_percentage = le32_to_cpu(top->compress_percentage);
        o->compress_chunk = le32_to_cpu(top->compress_chunk);
        o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
+       o->dedupe_mode = le32_to_cpu(top->dedupe_mode);
+       o->dedupe_working_set_percentage = le32_to_cpu(top->dedupe_working_set_percentage);
        o->block_error_hist = le32_to_cpu(top->block_error_hist);
        o->replay_align = le32_to_cpu(top->replay_align);
        o->replay_scale = le32_to_cpu(top->replay_scale);
@@ -499,6 +501,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->compress_percentage = cpu_to_le32(o->compress_percentage);
        top->compress_chunk = cpu_to_le32(o->compress_chunk);
        top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
+       top->dedupe_mode = cpu_to_le32(o->dedupe_mode);
+       top->dedupe_working_set_percentage = cpu_to_le32(o->dedupe_working_set_percentage);
        top->block_error_hist = cpu_to_le32(o->block_error_hist);
        top->replay_align = cpu_to_le32(o->replay_align);
        top->replay_scale = cpu_to_le32(o->replay_scale);
diff --git a/dedupe.c b/dedupe.c
new file mode 100644 (file)
index 0000000..043a376
--- /dev/null
+++ b/dedupe.c
@@ -0,0 +1,54 @@
+#include "fio.h"
+
+/*
+ * Pre-compute the random states (seeds) of the unique pages that make up
+ * the dedupe working set of a job.
+ *
+ * Nothing is done unless dedupe_percentage is non-zero and dedupe_mode is
+ * DEDUPE_MODE_WORKING_SET. Otherwise the working set is sized as
+ * dedupe_working_set_percentage percent of size, counted in blocks of
+ * min_bs[DDIR_WRITE], and one frand_state per block is stored in
+ * td->dedupe_working_set_states (td->num_unique_pages entries).
+ *
+ * Returns 0 on success or when there is nothing to do, 1 on error.
+ */
+int init_dedupe_working_set_seeds(struct thread_data *td)
+{
+       unsigned long long i;
+       struct frand_state dedupe_working_set_state = {0};
+
+       if (!td->o.dedupe_percentage || td->o.dedupe_mode != DEDUPE_MODE_WORKING_SET)
+               return 0;
+
+       /*
+        * The dedupe working set keeps seeds of unique data (generated by buf_state).
+        * Dedupe-ed pages will be generated using those seeds.
+        */
+       td->num_unique_pages = (td->o.size * (unsigned long long)td->o.dedupe_working_set_percentage / 100) / td->o.min_bs[DDIR_WRITE];
+
+       /*
+        * An empty working set leaves nothing to pick deduped buffers from:
+        * get_buf_state() draws an index in [0, num_unique_pages - 1], and
+        * that upper bound wraps around when num_unique_pages is 0.
+        */
+       if (!td->num_unique_pages) {
+               log_err("fio: dedupe working set is empty, increase size "
+                               "or dedupe_working_set_percentage\n");
+               return 1;
+       }
+
+       td->dedupe_working_set_states = malloc(sizeof(struct frand_state) * td->num_unique_pages);
+       if (!td->dedupe_working_set_states) {
+               log_err("fio: could not allocate dedupe working set\n");
+               return 1;
+       }
+
+       /* Walk a private copy of buf_state so td->buf_state itself is left untouched. */
+       frand_copy(&dedupe_working_set_state, &td->buf_state);
+       for (i = 0; i < td->num_unique_pages; i++) {
+               frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state);
+               __get_next_seed(&dedupe_working_set_state);
+       }
+
+       return 0;
+}
diff --git a/dedupe.h b/dedupe.h
new file mode 100644 (file)
index 0000000..d4c4dc3
--- /dev/null
+++ b/dedupe.h
@@ -0,0 +1,6 @@
+#ifndef DEDUPE_H
+#define DEDUPE_H
+
+int init_dedupe_working_set_seeds(struct thread_data *td);
+
+#endif
diff --git a/fio.1 b/fio.1
index 5aa54a4d0471772276737edae926bba7b7f7e63b..bd315e1133e786b923ea6bcfc16e06e996563dd2 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1509,6 +1509,48 @@ all \-\- this option only controls the distribution of unique buffers. Setting
 this option will also enable \fBrefill_buffers\fR to prevent every buffer
 being identical.
 .TP
+.BI dedupe_mode \fR=\fPstr
+If \fBdedupe_percentage\fR is given, then this option controls how fio
+generates the dedupe buffers.
+.RS
+.RS
+.TP
+.B repeat
+.P
+.RS
+Generate dedupe buffers by repeating previous writes
+.RE
+.TP
+.B working_set
+.P
+.RS
+Generate dedupe buffers from working set
+.RE
+.RE
+.P
+\fBrepeat\fR is the default option for fio. Dedupe buffers are generated
+by repeating the previous unique write.
+
+\fBworking_set\fR is a more realistic workload.
+With \fBworking_set\fR, \fBdedupe_working_set_percentage\fR should be provided.
+Given that, fio will use the initial unique write buffers as its working set.
+Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+Note that by using \fBworking_set\fR the dedupe percentage will converge
+to the desired value over time, while \fBrepeat\fR maintains the desired
+percentage throughout the job.
+.RE
+.RE
+.TP
+.BI dedupe_working_set_percentage \fR=\fPint
+If \fBdedupe_mode\fR is set to \fBworking_set\fR, then this controls
+the percentage of the size of the file or device used as the pool of
+buffers from which fio generates the dedupe buffers.
+.P
+.RS
+Note that \fBsize\fR needs to be explicitly provided and only 1 file
+per job is supported.
+.RE
+.TP
 .BI invalidate \fR=\fPbool
 Invalidate the buffer/page cache parts of the files to be used prior to
 starting I/O if the platform and file type support it. Defaults to true.
diff --git a/fio.h b/fio.h
index 83334652e61b79fb576ad972ee625f1d04512439..51686fd0f84c82fcc2a21d6fc57ba38ca5af3429 100644 (file)
--- a/fio.h
+++ b/fio.h
@@ -47,6 +47,7 @@
 #include "workqueue.h"
 #include "steadystate.h"
 #include "lib/nowarn_snprintf.h"
+#include "dedupe.h"
 
 #ifdef CONFIG_SOLARISAIO
 #include <sys/asynch.h>
@@ -140,6 +141,7 @@ enum {
        FIO_RAND_POISSON2_OFF,
        FIO_RAND_POISSON3_OFF,
        FIO_RAND_PRIO_CMDS,
+       FIO_RAND_DEDUPE_WORKING_SET_IX,
        FIO_RAND_NR_OFFS,
 };
 
@@ -263,6 +265,10 @@ struct thread_data {
        struct frand_state dedupe_state;
        struct frand_state zone_state;
        struct frand_state prio_state;
+       struct frand_state dedupe_working_set_index_state;
+       struct frand_state *dedupe_working_set_states;
+
+       unsigned long long num_unique_pages;
 
        struct zone_split_index **zone_state_index;
        unsigned int num_open_zones;
diff --git a/init.c b/init.c
index 60c7cff405d70d8e974545026e2fe659b512b7ed..871fb5ad1ff7e9daa902d40217fb75f000dbd736 100644 (file)
--- a/init.c
+++ b/init.c
@@ -958,6 +958,28 @@ static int fixup_options(struct thread_data *td)
 
        o->latency_target *= 1000ULL;
 
+       /*
+        * Dedupe working set verifications
+        */
+       if (o->dedupe_percentage && o->dedupe_mode == DEDUPE_MODE_WORKING_SET) {
+               if (!fio_option_is_set(o, size)) {
+                       log_err("fio: pregenerated dedupe working set "
+                                       "requires size to be set\n");
+                       ret |= 1;
+               } else if (o->nr_files != 1) {
+                       log_err("fio: dedupe working set mode supported with "
+                                       "single file per job, but %d files "
+                                       "provided\n", o->nr_files);
+                       ret |= 1;
+               } else if (o->dedupe_working_set_percentage + o->dedupe_percentage > 100) {
+                       log_err("fio: impossible to reach expected dedupe percentage %u "
+                                       "since %u percentage of size is reserved to dedupe working set "
+                                       "(those are unique pages)\n",
+                                       o->dedupe_percentage, o->dedupe_working_set_percentage);
+                       ret |= 1;
+               }
+       }
+
        return ret;
 }
 
@@ -1031,6 +1053,7 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
        init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
        init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
        init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false);
+       init_rand_seed(&td->dedupe_working_set_index_state, td->rand_seeds[FIO_RAND_DEDUPE_WORKING_SET_IX], use64);
 
        if (!td_random(td))
                return;
@@ -1491,6 +1514,9 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
        if (fixup_options(td))
                goto err;
 
+       if (init_dedupe_working_set_seeds(td))
+               goto err;
+
        /*
         * Belongs to fixup_options, but o->name is not necessarily set as yet
         */
diff --git a/io_u.c b/io_u.c
index b60488a3020f53382907abc7653e7d927eac8d4c..9a1cd547d75f720bf321b51e8db818bd2764995b 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -2172,6 +2172,7 @@ void io_u_queued(struct thread_data *td, struct io_u *io_u)
 static struct frand_state *get_buf_state(struct thread_data *td)
 {
        unsigned int v;
+       unsigned long long i;
 
        if (!td->o.dedupe_percentage)
                return &td->buf_state;
@@ -2182,16 +2183,25 @@ static struct frand_state *get_buf_state(struct thread_data *td)
 
        v = rand_between(&td->dedupe_state, 1, 100);
 
-       if (v <= td->o.dedupe_percentage) {
-               /*
-                * The caller advances the returned frand_state.
-                * A copy of prev should be returned instead since
-                * a subsequent intention to generate a deduped buffer
-                * might result in generating a unique one
-                */
-               frand_copy(&td->buf_state_ret, &td->buf_state_prev);
-               return &td->buf_state_ret;
-       }
+       if (v <= td->o.dedupe_percentage)
+               switch (td->o.dedupe_mode) {
+               case DEDUPE_MODE_REPEAT:
+                       /*
+                       * The caller advances the returned frand_state.
+                       * A copy of prev should be returned instead since
+                       * a subsequent intention to generate a deduped buffer
+                       * might result in generating a unique one
+                       */
+                       frand_copy(&td->buf_state_ret, &td->buf_state_prev);
+                       return &td->buf_state_ret;
+               case DEDUPE_MODE_WORKING_SET:
+                       i = rand_between(&td->dedupe_working_set_index_state, 0, td->num_unique_pages - 1);
+                       frand_copy(&td->buf_state_ret, &td->dedupe_working_set_states[i]);
+                       return &td->buf_state_ret;
+               default:
+                       log_err("unexpected dedupe mode %u\n", td->o.dedupe_mode);
+                       assert(0);
+               }
 
        return &td->buf_state;
 }
diff --git a/lib/rand.c b/lib/rand.c
index 5eb6e60aeb6b651e88595b4ba9c2d875d1a5748d..e74da609ac945dab2a7670beb593727ffa5d5fac 100644 (file)
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -125,10 +125,7 @@ void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
 uint64_t fill_random_buf(struct frand_state *fs, void *buf,
                         unsigned int len)
 {
-       uint64_t r = __rand(fs);
-
-       if (sizeof(int) != sizeof(long *))
-               r *= (unsigned long) __rand(fs);
+       uint64_t r = __get_next_seed(fs);
 
        __fill_random_buf(buf, len, r);
        return r;
@@ -188,10 +185,7 @@ uint64_t fill_random_buf_percentage(struct frand_state *fs, void *buf,
                                    unsigned int segment, unsigned int len,
                                    char *pattern, unsigned int pbytes)
 {
-       uint64_t r = __rand(fs);
-
-       if (sizeof(int) != sizeof(long *))
-               r *= (unsigned long) __rand(fs);
+       uint64_t r = __get_next_seed(fs);
 
        __fill_random_buf_percentage(r, buf, percentage, segment, len,
                                        pattern, pbytes);
diff --git a/lib/rand.h b/lib/rand.h
index 46c1c5e023a132513ded05d885bd5769dbf7ceb9..a806004504013bc41986b6d1b6dbd6f6067b0a91 100644 (file)
--- a/lib/rand.h
+++ b/lib/rand.h
@@ -150,6 +150,16 @@ static inline uint64_t rand_between(struct frand_state *state, uint64_t start,
                return start + rand32_upto(state, end - start);
 }
 
+static inline uint64_t __get_next_seed(struct frand_state *fs)
+{
+       uint64_t r = __rand(fs); /* seed starts as one draw from fs */
+
+       if (sizeof(int) != sizeof(long *)) /* pointers are not int-sized */
+               r *= (unsigned long) __rand(fs); /* fold in a second draw */
+
+       return r; /* callers pass this to __fill_random_buf*() as the seed */
+}
+
 extern void init_rand(struct frand_state *, bool);
 extern void init_rand_seed(struct frand_state *, uint64_t seed, bool);
 extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed);
index a8986d116716fb7f5df8e572caff0a6c8dfdc995..8c2ab7cc84d5e839050feadcedc64741b24a3b61 100644 (file)
--- a/options.c
+++ b/options.c
@@ -4497,6 +4497,40 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_IO_BUF,
        },
+       {
+               .name   = "dedupe_mode",
+               .lname  = "Dedupe mode",
+               .help   = "Mode for the deduplication buffer generation",
+               .type   = FIO_OPT_STR,
+               .off1   = offsetof(struct thread_options, dedupe_mode),
+               .parent = "dedupe_percentage",
+               .def    = "repeat",
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_IO_BUF,
+               .posval = {
+                          { .ival = "repeat",
+                            .oval = DEDUPE_MODE_REPEAT,
+                            .help = "repeat previous page",
+                          },
+                          { .ival = "working_set",
+                            .oval = DEDUPE_MODE_WORKING_SET,
+                            .help = "choose a page randomly from limited working set defined in dedupe_working_set_percentage",
+                          },
+               },
+       },
+       {
+               .name   = "dedupe_working_set_percentage",
+               .lname  = "Dedupe working set percentage",
+               .help   = "Dedupe working set size in percentages from file or device size used to generate dedupe patterns from",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct thread_options, dedupe_working_set_percentage),
+               .parent = "dedupe_percentage",
+               .def    = "5",
+               .maxval = 100,
+               .minval = 0,
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_IO_BUF,
+       },
        {
                .name   = "clat_percentiles",
                .lname  = "Completion latency percentiles",
index c128df28adda112d04ad67549e1d0fa4194f5b1c..daed057acb855472805971d0620e57fbe79b7d40 100644 (file)
--- a/server.h
+++ b/server.h
@@ -48,7 +48,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-       FIO_SERVER_VER                  = 91,
+       FIO_SERVER_VER                  = 92,
 
        FIO_SERVER_MAX_FRAGMENT_PDU     = 1024,
        FIO_SERVER_MAX_CMD_MB           = 2048,
diff --git a/t/dedupe.c b/t/dedupe.c
index 68d31f19bd7b6a028975ece96918e6f9a014c8c0..8b659c76c71caca57be3e80c9b4ed94077fb79a8 100644 (file)
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -473,11 +473,14 @@ static void show_chunk(struct chunk *c)
        }
 }
 
-static void show_stat(uint64_t nextents, uint64_t nchunks)
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents)
 {
        double perc, ratio;
 
-       printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks);
+       printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents, (unsigned long) nchunks);
+       if (!bloom)
+               printf(" Duplicated extents=%lu", (unsigned long) ndupextents);
+       printf("\n");
 
        if (nchunks) {
                ratio = (double) nextents / (double) nchunks;
@@ -485,17 +488,20 @@ static void show_stat(uint64_t nextents, uint64_t nchunks)
        } else
                printf("De-dupe ratio: 1:infinite\n");
 
+       if (ndupextents)
+               printf("De-dupe working set at least: %3.2f%%\n", 100.0 * (double) ndupextents / (double) nextents);
+
        perc = 1.00 - ((double) nchunks / (double) nextents);
        perc *= 100.0;
        printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
 
 }
 
-static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
 {
        struct fio_rb_node *n;
 
-       *nchunks = *nextents = 0;
+       *nchunks = *nextents = *ndupextents = 0;
 
        n = rb_first(&rb_root);
        if (!n)
@@ -507,6 +513,7 @@ static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
                c = rb_entry(n, struct chunk, rb_node);
                (*nchunks)++;
                *nextents += c->count;
+               *ndupextents += (c->count > 1);
 
                if (dump_output)
                        show_chunk(c);
@@ -530,7 +537,7 @@ static int usage(char *argv[])
 
 int main(int argc, char *argv[])
 {
-       uint64_t nextents = 0, nchunks = 0;
+       uint64_t nextents = 0, nchunks = 0, ndupextents = 0;
        int c, ret;
 
        arch_init(argv);
@@ -583,9 +590,9 @@ int main(int argc, char *argv[])
 
        if (!ret) {
                if (!bloom)
-                       iter_rb_tree(&nextents, &nchunks);
+                       iter_rb_tree(&nextents, &nchunks, &ndupextents);
 
-               show_stat(nextents, nchunks);
+               show_stat(nextents, nchunks, ndupextents);
        }
 
        fio_sem_remove(rb_lock);
diff --git a/thread_options.h b/thread_options.h
index 05c2d1383e68403190d175608b995ece11d958fa..4b4ecfe104dbedb1f76fcee0aa31cb6dbf024cc3 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -31,6 +31,14 @@ enum fio_memtype {
        MEM_CUDA_MALLOC,/* use GPU memory */
 };
 
+/*
+ * How deduped buffers are generated (values of the dedupe_mode option)
+ */
+enum dedupe_mode {
+       DEDUPE_MODE_REPEAT = 0, /* "repeat": replay the previous buffer state */
+       DEDUPE_MODE_WORKING_SET = 1, /* "working_set": pick a random working set entry */
+};
+
 #define ERROR_STR_MAX  128
 
 #define BSSPLIT_MAX    64
@@ -243,6 +251,8 @@ struct thread_options {
        unsigned int compress_percentage;
        unsigned int compress_chunk;
        unsigned int dedupe_percentage;
+       unsigned int dedupe_mode;
+       unsigned int dedupe_working_set_percentage;
        unsigned int time_based;
        unsigned int disable_lat;
        unsigned int disable_clat;
@@ -549,6 +559,8 @@ struct thread_options_pack {
        uint32_t compress_percentage;
        uint32_t compress_chunk;
        uint32_t dedupe_percentage;
+       uint32_t dedupe_mode;
+       uint32_t dedupe_working_set_percentage;
        uint32_t time_based;
        uint32_t disable_lat;
        uint32_t disable_clat;