genzipf: improve accuracy
authorJens Axboe <axboe@kernel.dk>
Wed, 7 Nov 2012 10:43:50 +0000 (11:43 +0100)
committerJens Axboe <axboe@kernel.dk>
Wed, 7 Nov 2012 10:43:50 +0000 (11:43 +0100)
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Makefile
t/genzipf.c

index fc3a795e4e0bd5aa44dd9449a1611c2cf3412bb4..f0f3ea0f00c03050cfb1435ecda14ef33e227f25 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -77,7 +77,7 @@ T_IEEE_OBJS += ieee754.o
 T_IEEE_PROGS = t/ieee754
 
 T_ZIPF_OBS = t/genzipf.o
 T_IEEE_PROGS = t/ieee754
 
 T_ZIPF_OBS = t/genzipf.o
-T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/zipf.o t/genzipf.o
+T_ZIPF_OBJS += rbtree.o t/log.o lib/ieee754.o lib/rand.o lib/zipf.o t/genzipf.o
 T_ZIPF_PROGS = t/genzip
 
 T_OBJS = $(T_SMALLOC_OBJS)
 T_ZIPF_PROGS = t/genzip
 
 T_OBJS = $(T_SMALLOC_OBJS)
index f28674424df13fdf15b16aed76ec488dff1e266a..ae88484b8128797dea4049d51acb2504f6ae2320 100644 (file)
 #include <string.h>
 
 #include "../lib/zipf.h"
 #include <string.h>
 
 #include "../lib/zipf.h"
+#include "../flist.h"
+#include "../hash.h"
+#include "../rbtree.h"
 
 #define DEF_NR         1000000
 #define DEF_NR_OUTPUT  23
 
 
 #define DEF_NR         1000000
 #define DEF_NR_OUTPUT  23
 
-static int val_cmp(const void *p1, const void *p2)
+struct node {
+       struct flist_head list;
+       struct rb_node rb;
+       unsigned long long val;
+       unsigned long hits;
+};
+
+static struct flist_head *hash;
+static unsigned long hash_bits = 24;
+static unsigned long hash_size = 1 << 24;
+static struct rb_root rb;
+
+static struct node *hash_lookup(unsigned long long val)
+{
+       struct flist_head *l = &hash[hash_long(val, hash_bits)];
+       struct flist_head *entry;
+       struct node *n;
+
+       flist_for_each(entry, l) {
+               n = flist_entry(entry, struct node, list);
+               if (n->val == val)
+                       return n;
+       }
+
+       return NULL;
+}
+
+static void hash_insert(unsigned long long val)
+{
+       struct flist_head *l = &hash[hash_long(val, hash_bits)];
+       struct node *n = malloc(sizeof(*n));
+
+       n->val = val;
+       n->hits = 1;
+       flist_add_tail(&n->list, l);
+}
+
+static void rb_insert(struct node *n)
+{
+       struct rb_node **p, *parent;
+
+       memset(&n->rb, 0, sizeof(n->rb));
+       p = &rb.rb_node;
+       parent = NULL;
+       while (*p) {
+               struct node *__n;
+
+               parent = *p;
+               __n = rb_entry(parent, struct node, rb);
+               if (n->hits > __n->hits)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&n->rb, parent, p);
+       rb_insert_color(&n->rb, &rb);
+}
+
+static unsigned long rb_add(struct flist_head *list)
 {
 {
-       const unsigned long *v1 = p1;
-       const unsigned long *v2 = p2;
+       struct flist_head *entry;
+       unsigned long ret = 0;
+       struct node *n;
+
+       flist_for_each(entry, list) {
+               n = flist_entry(entry, struct node, list);
+
+               rb_insert(n);
+               ret++;
+       }
 
 
-       return *v1 - *v2;
+       return ret;
+}
+
+static unsigned long rb_gen(void)
+{
+       unsigned long ret = 0;
+       unsigned int i;
+
+       for (i = 0; i < hash_size; i++)
+               ret += rb_add(&hash[i]);
+
+       return ret;
 }
 
 int main(int argc, char *argv[])
 {
        unsigned long nranges, output_nranges;
 }
 
 int main(int argc, char *argv[])
 {
        unsigned long nranges, output_nranges;
-       unsigned long *vals;
-       unsigned long i, j, nr_vals, cur_vals, max_val, interval;
+       unsigned long offset;
+       unsigned long i, j, nr_vals, cur_vals, interval;
        double *output, perc, perc_i;
        struct zipf_state zs;
        double *output, perc, perc_i;
        struct zipf_state zs;
+       struct rb_node *n;
        int use_zipf;
        double val;
 
        int use_zipf;
        double val;
 
@@ -71,52 +153,60 @@ int main(int argc, char *argv[])
        else
                pareto_init(&zs, nranges, val);
 
        else
                pareto_init(&zs, nranges, val);
 
-       vals = malloc(nranges * sizeof(unsigned long));
+       hash = malloc(hash_size * sizeof(struct flist_head));
+       for (i = 0; i < hash_size; i++)
+               INIT_FLIST_HEAD(&hash[i]);
+
+       for (nr_vals = 0, i = 0; i < nranges; i++) {
+               struct node *n;
 
 
-       max_val = nr_vals = 0;
-       for (i = 0; i < nranges; i++) {
                if (use_zipf)
                if (use_zipf)
-                       vals[nr_vals] = zipf_next(&zs);
+                       offset = zipf_next(&zs);
                else
                else
-                       vals[nr_vals] = pareto_next(&zs);
+                       offset = pareto_next(&zs);
+
+               n = hash_lookup(offset);
+               if (n)
+                       n->hits++;
+               else
+                       hash_insert(offset);
 
 
-               if (vals[nr_vals] > max_val)
-                       max_val = vals[nr_vals];
                nr_vals++;
        }
 
                nr_vals++;
        }
 
-       qsort(vals, nr_vals, sizeof(unsigned long), val_cmp);
+       nr_vals = rb_gen();
 
 
-       interval = (max_val + output_nranges - 1) / output_nranges;
+       interval = nr_vals / output_nranges;
 
        output = malloc(output_nranges * sizeof(double));
 
 
        output = malloc(output_nranges * sizeof(double));
 
-       for (i = j = 0, cur_vals = 0; i < nr_vals; i++) {
-               if (vals[i] > interval) {
-                       output[j] = (double) (cur_vals + 1) / (double) nr_vals;
+       i = j = cur_vals = 0;
+       
+       n = rb_first(&rb);
+       while (n) {
+               struct node *node = rb_entry(n, struct node, rb);
+
+               if (i >= interval) {
+                       output[j] = (double) (cur_vals + 1) / (double) nranges;
                        output[j] *= 100.0;
                        j++;
                        output[j] *= 100.0;
                        j++;
-                       cur_vals = 0;
-                       interval += (max_val + output_nranges - 1) / output_nranges;
-                       continue;
-               }
-               cur_vals++;
-       }
+                       cur_vals = node->hits;
+                       interval += nr_vals / output_nranges;
+               } else
+                       cur_vals += node->hits;
 
 
-       if (cur_vals) {
-               output[j] = (double) (cur_vals + 1) / (double) nr_vals;
-               output[j] *= 100.0;
-               j++;
+               n = rb_next(n);
+               i++;
        }
 
        perc_i = 100.0 / (double) output_nranges;
        perc = 0.0;
        for (i = 0; i < j; i++) {
        }
 
        perc_i = 100.0 / (double) output_nranges;
        perc = 0.0;
        for (i = 0; i < j; i++) {
-               printf("%6.2f%% -> %6.2f%%:\t%6.2f%%\n",  perc, perc + perc_i, output[i]);
                perc += perc_i;
                perc += perc_i;
+               printf("Top %6.2f%%:\t%6.2f%% of hits\n",  perc, output[i]);
        }
 
        free(output);
        }
 
        free(output);
-       free(vals);
+       free(hash);
        return 0;
 }
        return 0;
 }