#include "../lib/bloom.h"
#include "debug.h"
+#include "zlib.h"
+
+struct zlib_ctrl {
+ z_stream stream;
+ unsigned char *buf_in;
+ unsigned char *buf_out;
+};
struct worker_thread {
+ struct zlib_ctrl zc;
pthread_t thread;
-
- volatile int done;
-
- int fd;
uint64_t cur_offset;
uint64_t size;
-
+ unsigned long long unique_capacity;
unsigned long items;
unsigned long dupes;
int err;
+ int fd;
+ volatile int done;
};
struct extent {
static unsigned int collision_check;
static unsigned int print_progress = 1;
static unsigned int use_bloom = 1;
+static unsigned int compression = 0;
static uint64_t total_size;
static uint64_t cur_offset;
return 0;
}
ret = bytes;
- } else
+ } else {
ret = sb->st_size;
+ }
return (ret & ~((uint64_t)blocksize - 1));
}
if (ret < 0) {
perror("pread");
return 1;
- } else if (!ret)
+ } else if (!ret) {
return 1;
- else if (ret != count) {
+ } else if (ret != count) {
log_err("dedupe: short read on block\n");
return 1;
}
return __read_block(fd, buf, offset, blocksize);
}
+static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
+ struct zlib_ctrl *zc)
+{
+ z_stream *stream = &zc->stream;
+ unsigned int compressed_len;
+ int ret;
+
+ if (read_block(file.fd, zc->buf_in, offset))
+ return;
+
+ stream->next_in = zc->buf_in;
+ stream->avail_in = blocksize;
+ stream->avail_out = deflateBound(stream, blocksize);
+ stream->next_out = zc->buf_out;
+
+ ret = deflate(stream, Z_FINISH);
+ assert(ret != Z_STREAM_ERROR);
+ compressed_len = blocksize - stream->avail_out;
+
+ if (dump_output)
+ printf("offset 0x%lx compressed to %d blocksize %d ratio %.2f \n",
+ (unsigned long) offset, compressed_len, blocksize,
+ (float)compressed_len / (float)blocksize);
+
+ *unique_capacity += compressed_len;
+ deflateReset(stream);
+}
+
static void add_item(struct chunk *c, struct item *i)
{
/*
if (collision_check || dump_output) {
c = malloc(sizeof(struct chunk) + sizeof(struct flist_head));
INIT_FLIST_HEAD(&c->extent_list[0]);
- } else
+ } else {
c = malloc(sizeof(struct chunk));
+ }
return c;
}
-static void insert_chunk(struct item *i)
+static void insert_chunk(struct item *i, uint64_t *unique_capacity,
+ struct zlib_ctrl *zc)
{
struct fio_rb_node **p, *parent;
struct chunk *c;
c = rb_entry(parent, struct chunk, rb_node);
diff = memcmp(i->hash, c->hash, sizeof(i->hash));
- if (diff < 0)
+ if (diff < 0) {
p = &(*p)->rb_left;
- else if (diff > 0)
+ } else if (diff > 0) {
p = &(*p)->rb_right;
- else {
+ } else {
int ret;
if (!collision_check)
memcpy(c->hash, i->hash, sizeof(i->hash));
rb_link_node(&c->rb_node, parent, p);
rb_insert_color(&c->rb_node, &rb_root);
+ if (compression)
+ account_unique_capacity(i->offset, unique_capacity, zc);
add:
add_item(c, i);
}
static void insert_chunks(struct item *items, unsigned int nitems,
- uint64_t *ndupes)
+ uint64_t *ndupes, uint64_t *unique_capacity,
+ struct zlib_ctrl *zc)
{
int i;
r = bloom_set(bloom, items[i].hash, s);
*ndupes += r;
} else
- insert_chunk(&items[i]);
+ insert_chunk(&items[i], unique_capacity, zc);
}
fio_sem_up(rb_lock);
off_t offset;
int nitems = 0;
uint64_t ndupes = 0;
+ uint64_t unique_capacity = 0;
struct item *items;
offset = thread->cur_offset;
- nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size));
+ nblocks = read_blocks(thread->fd, buf, offset,
+ min(thread->size, (uint64_t) chunk_size));
if (!nblocks)
return 1;
nitems++;
}
- insert_chunks(items, nitems, &ndupes);
+ insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
free(items);
thread->items += nitems;
thread->dupes += ndupes;
+ thread->unique_capacity += unique_capacity;
return 0;
}
+static void thread_init_zlib_control(struct worker_thread *thread)
+{
+ size_t sz;
+
+ z_stream *stream = &thread->zc.stream;
+ stream->zalloc = Z_NULL;
+ stream->zfree = Z_NULL;
+ stream->opaque = Z_NULL;
+
+ if (deflateInit(stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+ return;
+
+ thread->zc.buf_in = fio_memalign(blocksize, blocksize, false);
+ sz = deflateBound(stream, blocksize);
+ thread->zc.buf_out = fio_memalign(blocksize, sz, false);
+}
+
static void *thread_fn(void *data)
{
struct worker_thread *thread = data;
void *buf;
buf = fio_memalign(blocksize, chunk_size, false);
+ thread_init_zlib_control(thread);
do {
if (get_work(&thread->cur_offset, &thread->size)) {
printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items);
last_nitems = nitems;
fio_gettime(&last_tv, NULL);
- } else
+ } else {
printf("%3.2f%% done\r", perc);
+ }
fflush(stdout);
usleep(250000);
};
}
static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
- uint64_t *nextents, uint64_t *nchunks)
+ uint64_t *nextents, uint64_t *nchunks,
+ uint64_t *unique_capacity)
{
struct worker_thread *threads;
unsigned long nitems, total_items;
nitems = 0;
*nextents = 0;
*nchunks = 1;
+ *unique_capacity = 0;
for (i = 0; i < num_threads; i++) {
void *ret;
pthread_join(threads[i].thread, &ret);
nitems += threads[i].items;
*nchunks += threads[i].dupes;
+ *unique_capacity += threads[i].unique_capacity;
}
printf("Threads(%u): %lu items processed\n", num_threads, nitems);
}
static int dedupe_check(const char *filename, uint64_t *nextents,
- uint64_t *nchunks)
+ uint64_t *nchunks, uint64_t *unique_capacity)
{
uint64_t dev_size;
struct stat sb;
bloom = bloom_new(bloom_entries);
}
- printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads);
+ printf("Will check <%s>, size <%llu>, using %u threads\n", filename,
+ (unsigned long long) dev_size, num_threads);
- return run_dedupe_threads(&file, dev_size, nextents, nchunks);
+ return run_dedupe_threads(&file, dev_size, nextents, nchunks,
+ unique_capacity);
err:
if (file.fd != -1)
close(file.fd);
struct flist_head *n;
struct extent *e;
- printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count);
+ printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1],
+ c->hash[2], c->hash[3], (unsigned long) c->count);
flist_for_each(n, &c->extent_list[0]) {
e = flist_entry(n, struct extent, list);
printf("\toffset %llu\n", (unsigned long long) e->offset);
}
}
-static void show_stat(uint64_t nextents, uint64_t nchunks)
+static const char *capacity_unit[] = {"b","KB", "MB", "GB", "TB", "PB", "EB"};
+
+static uint64_t bytes_to_human_readable_unit(uint64_t n, const char **unit_out)
+{
+ uint8_t i = 0;
+
+ while (n >= 1024) {
+ i++;
+ n /= 1024;
+ }
+
+ *unit_out = capacity_unit[i];
+ return n;
+}
+
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents,
+ uint64_t unique_capacity)
{
double perc, ratio;
+ const char *unit;
+ uint64_t uc_human;
- printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks);
+ printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents,
+ (unsigned long) nchunks);
+ if (!bloom)
+ printf(" Duplicated extents=%lu", (unsigned long) ndupextents);
+ printf("\n");
if (nchunks) {
ratio = (double) nextents / (double) nchunks;
printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);
- } else
+ } else {
printf("De-dupe ratio: 1:infinite\n");
+ }
+
+ if (ndupextents) {
+ printf("De-dupe working set at least: %3.2f%%\n",
+ 100.0 * (double) ndupextents / (double) nextents);
+ }
perc = 1.00 - ((double) nchunks / (double) nextents);
perc *= 100.0;
printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
+
+ if (compression) {
+ uc_human = bytes_to_human_readable_unit(unique_capacity, &unit);
+ printf("Unique capacity %lu%s\n", (unsigned long) uc_human, unit);
+ }
}
-static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
{
struct fio_rb_node *n;
-
- *nchunks = *nextents = 0;
+ *nchunks = *nextents = *ndupextents = 0;
n = rb_first(&rb_root);
if (!n)
c = rb_entry(n, struct chunk, rb_node);
(*nchunks)++;
*nextents += c->count;
+ *ndupextents += (c->count > 1);
if (dump_output)
show_chunk(c);
log_err("\t-c\tFull collision check\n");
log_err("\t-B\tUse probabilistic bloom filter\n");
log_err("\t-p\tPrint progress indicator\n");
+ log_err("\t-C\tCalculate compressible size\n");
return 1;
}
int main(int argc, char *argv[])
{
- uint64_t nextents = 0, nchunks = 0;
+ uint64_t nextents = 0, nchunks = 0, ndupextents = 0, unique_capacity;
int c, ret;
arch_init(argv);
debug_init();
- while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+ while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:C:")) != -1) {
switch (c) {
case 'b':
blocksize = atoi(optarg);
case 'B':
use_bloom = atoi(optarg);
break;
+ case 'C':
+ compression = atoi(optarg);
+ break;
case '?':
default:
return usage(argv);
}
}
- if (collision_check || dump_output)
+ if (collision_check || dump_output || compression)
use_bloom = 0;
if (!num_threads)
rb_root = RB_ROOT;
rb_lock = fio_sem_init(FIO_SEM_UNLOCKED);
- ret = dedupe_check(argv[optind], &nextents, &nchunks);
+ ret = dedupe_check(argv[optind], &nextents, &nchunks, &unique_capacity);
if (!ret) {
if (!bloom)
- iter_rb_tree(&nextents, &nchunks);
+ iter_rb_tree(&nextents, &nchunks, &ndupextents);
- show_stat(nextents, nchunks);
+ show_stat(nextents, nchunks, ndupextents, unique_capacity);
}
fio_sem_remove(rb_lock);