fio-dedup: adjusted the binary to support compression
authorBar David <bardavvid@gmail.com>
Wed, 10 Nov 2021 06:56:47 +0000 (08:56 +0200)
committerBar David <bardavvid@gmail.com>
Sun, 21 Nov 2021 07:25:22 +0000 (09:25 +0200)
When given -C 1 option, fio-dedup will first look for
dedupable data and then calculate compressible (unique)
data opportunity. The rationale is to measure the total
data reduction potential.

Signed-off-by: Bar David <bardavvid@gmail.com>
t/dedupe.c

index 8b659c76c71caca57be3e80c9b4ed94077fb79a8..5f72ed76cf27a561a5f52483156b14460ca4c766 100644 (file)
 
 #include "../lib/bloom.h"
 #include "debug.h"
+#include "zlib.h"
+
+struct zlib_ctrl {
+       z_stream stream;
+       unsigned char *buf_in;
+       unsigned char *buf_out;
+};
 
 struct worker_thread {
+       struct zlib_ctrl zc;
        pthread_t thread;
-
-       volatile int done;
-
-       int fd;
        uint64_t cur_offset;
        uint64_t size;
-
+       unsigned long long unique_capacity;
        unsigned long items;
        unsigned long dupes;
        int err;
+       int fd;
+       volatile int done;
 };
 
 struct extent {
@@ -68,6 +74,7 @@ static unsigned int odirect;
 static unsigned int collision_check;
 static unsigned int print_progress = 1;
 static unsigned int use_bloom = 1;
+static unsigned int compression = 0;
 
 static uint64_t total_size;
 static uint64_t cur_offset;
@@ -135,6 +142,34 @@ static int read_block(int fd, void *buf, off_t offset)
        return __read_block(fd, buf, offset, blocksize);
 }
 
+static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity, struct zlib_ctrl *zc)
+{
+       z_stream *stream = &zc->stream;
+       unsigned int compressed_len;
+       int ret;
+
+       if (read_block(file.fd, zc->buf_in, offset))
+               return;
+
+       stream->next_in = zc->buf_in;
+       stream->avail_in = blocksize;
+       stream->avail_out = deflateBound(stream, blocksize);
+       stream->next_out = zc->buf_out;
+
+       ret = deflate(stream, Z_FINISH);
+       assert(ret != Z_STREAM_ERROR);
+       compressed_len = blocksize - stream->avail_out;
+
+       if (dump_output)
+               printf("offset 0x%lx compressed to %d blocksize %d ratio %.2f \n",
+                               (unsigned long) offset, compressed_len, blocksize,
+                               (float)compressed_len / (float)blocksize);
+
+       *unique_capacity += compressed_len;
+
+       deflateReset(stream);
+}
+
 static void add_item(struct chunk *c, struct item *i)
 {
        /*      
@@ -188,7 +223,7 @@ static struct chunk *alloc_chunk(void)
        return c;
 }
 
-static void insert_chunk(struct item *i)
+static void insert_chunk(struct item *i, uint64_t *unique_capacity, struct zlib_ctrl *zc)
 {
        struct fio_rb_node **p, *parent;
        struct chunk *c;
@@ -228,12 +263,14 @@ static void insert_chunk(struct item *i)
        memcpy(c->hash, i->hash, sizeof(i->hash));
        rb_link_node(&c->rb_node, parent, p);
        rb_insert_color(&c->rb_node, &rb_root);
+       if (compression)
+               account_unique_capacity(i->offset, unique_capacity, zc);
 add:
        add_item(c, i);
 }
 
 static void insert_chunks(struct item *items, unsigned int nitems,
-                         uint64_t *ndupes)
+                         uint64_t *ndupes, uint64_t *unique_capacity, struct zlib_ctrl *zc)
 {
        int i;
 
@@ -248,7 +285,7 @@ static void insert_chunks(struct item *items, unsigned int nitems,
                        r = bloom_set(bloom, items[i].hash, s);
                        *ndupes += r;
                } else
-                       insert_chunk(&items[i]);
+                       insert_chunk(&items[i], unique_capacity, zc);
        }
 
        fio_sem_up(rb_lock);
@@ -277,6 +314,7 @@ static int do_work(struct worker_thread *thread, void *buf)
        off_t offset;
        int nitems = 0;
        uint64_t ndupes = 0;
+       uint64_t unique_capacity = 0;
        struct item *items;
 
        offset = thread->cur_offset;
@@ -296,14 +334,29 @@ static int do_work(struct worker_thread *thread, void *buf)
                nitems++;
        }
 
-       insert_chunks(items, nitems, &ndupes);
+       insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
 
        free(items);
        thread->items += nitems;
        thread->dupes += ndupes;
+       thread->unique_capacity += unique_capacity;
        return 0;
 }
 
+static void thread_init_zlib_control(struct worker_thread *thread)
+{
+       z_stream *stream = &thread->zc.stream;
+       stream->zalloc = Z_NULL;
+       stream->zfree = Z_NULL;
+       stream->opaque = Z_NULL;
+
+       if (deflateInit(stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+               return;
+
+       thread->zc.buf_in = fio_memalign(blocksize, blocksize, false);
+       thread->zc.buf_out = fio_memalign(blocksize, deflateBound(stream, blocksize), false);
+}
+
 static void *thread_fn(void *data)
 {
        struct worker_thread *thread = data;
@@ -311,6 +364,8 @@ static void *thread_fn(void *data)
 
        buf = fio_memalign(blocksize, chunk_size, false);
 
+       thread_init_zlib_control(thread);
+
        do {
                if (get_work(&thread->cur_offset, &thread->size)) {
                        thread->err = 1;
@@ -370,7 +425,7 @@ static void show_progress(struct worker_thread *threads, unsigned long total)
 }
 
 static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
-                             uint64_t *nextents, uint64_t *nchunks)
+                             uint64_t *nextents, uint64_t *nchunks, uint64_t *unique_capacity)
 {
        struct worker_thread *threads;
        unsigned long nitems, total_items;
@@ -398,11 +453,13 @@ static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
        nitems = 0;
        *nextents = 0;
        *nchunks = 1;
+       *unique_capacity = 0;
        for (i = 0; i < num_threads; i++) {
                void *ret;
                pthread_join(threads[i].thread, &ret);
                nitems += threads[i].items;
                *nchunks += threads[i].dupes;
+               *unique_capacity += threads[i].unique_capacity;
        }
 
        printf("Threads(%u): %lu items processed\n", num_threads, nitems);
@@ -416,7 +473,7 @@ static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
 }
 
 static int dedupe_check(const char *filename, uint64_t *nextents,
-                       uint64_t *nchunks)
+                       uint64_t *nchunks, uint64_t *unique_capacity)
 {
        uint64_t dev_size;
        struct stat sb;
@@ -453,7 +510,7 @@ static int dedupe_check(const char *filename, uint64_t *nextents,
 
        printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads);
 
-       return run_dedupe_threads(&file, dev_size, nextents, nchunks);
+       return run_dedupe_threads(&file, dev_size, nextents, nchunks, unique_capacity);
 err:
        if (file.fd != -1)
                close(file.fd);
@@ -473,9 +530,27 @@ static void show_chunk(struct chunk *c)
        }
 }
 
-static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents)
+static const char *capacity_unit[] = {"b","KB", "MB", "GB", "TB", "PB", "EB"};
+
+static uint64_t bytes_to_human_readable_unit(uint64_t n, const char **unit_out)
+{
+       uint8_t i = 0;
+
+       while (n >= 1024) {
+               i++;
+               n /= 1024;
+       }
+
+       *unit_out = capacity_unit[i];
+
+       return n;
+}
+
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents, uint64_t unique_capacity)
 {
        double perc, ratio;
+       const char *unit;
+       uint64_t uc_human;
 
        printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents, (unsigned long) nchunks);
        if (!bloom)
@@ -495,12 +570,16 @@ static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents)
        perc *= 100.0;
        printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
 
+
+       if (compression) {
+               uc_human = bytes_to_human_readable_unit(unique_capacity, &unit);
+               printf("Unique capacity %lu%s\n", (unsigned long) uc_human, unit);
+       }
 }
 
 static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
 {
        struct fio_rb_node *n;
-
        *nchunks = *nextents = *ndupextents = 0;
 
        n = rb_first(&rb_root);
@@ -532,18 +611,19 @@ static int usage(char *argv[])
        log_err("\t-c\tFull collision check\n");
        log_err("\t-B\tUse probabilistic bloom filter\n");
        log_err("\t-p\tPrint progress indicator\n");
+       log_err("\t-C\tCalculate compressible size\n");
        return 1;
 }
 
 int main(int argc, char *argv[])
 {
-       uint64_t nextents = 0, nchunks = 0, ndupextents = 0;
+       uint64_t nextents = 0, nchunks = 0, ndupextents = 0, unique_capacity;
        int c, ret;
 
        arch_init(argv);
        debug_init();
 
-       while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+       while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:C:")) != -1) {
                switch (c) {
                case 'b':
                        blocksize = atoi(optarg);
@@ -566,13 +646,16 @@ int main(int argc, char *argv[])
                case 'B':
                        use_bloom = atoi(optarg);
                        break;
+               case 'C':
+                       compression = atoi(optarg);
+                       break;
                case '?':
                default:
                        return usage(argv);
                }
        }
 
-       if (collision_check || dump_output)
+       if (collision_check || dump_output || compression)
                use_bloom = 0;
 
        if (!num_threads)
@@ -586,13 +669,13 @@ int main(int argc, char *argv[])
        rb_root = RB_ROOT;
        rb_lock = fio_sem_init(FIO_SEM_UNLOCKED);
 
-       ret = dedupe_check(argv[optind], &nextents, &nchunks);
+       ret = dedupe_check(argv[optind], &nextents, &nchunks, &unique_capacity);
 
        if (!ret) {
                if (!bloom)
                        iter_rb_tree(&nextents, &nchunks, &ndupextents);
 
-               show_stat(nextents, nchunks, ndupextents);
+               show_stat(nextents, nchunks, ndupextents, unique_capacity);
        }
 
        fio_sem_remove(rb_lock);