X-Git-Url: https://git.kernel.dk/?a=blobdiff_plain;f=t%2Fgenzipf.c;h=4fc10ae72d8421e3407d0239d0f96c2257529d59;hb=c7b42e0a8c3d6fb179cc7c681fcee58623de8b2f;hp=4ba940cfefeb60f69045ca9f14f8ea20e1fc4a5b;hpb=444256eaf4f0ffdc4cacef37873d58f2a65bf8e6;p=fio.git

diff --git a/t/genzipf.c b/t/genzipf.c
index 4ba940cf..4fc10ae7 100644
--- a/t/genzipf.c
+++ b/t/genzipf.c
@@ -3,10 +3,10 @@
  * what an access pattern would look like.
  *
  * For instance, the following would generate a zipf distribution
- * with theta 1.2, using 100,000 values and split the reporting into
- * 20 buckets:
+ * with theta 1.2, using 262144 (1 GiB / 4096) values and split the
+ * reporting into 20 buckets:
  *
- *	t/genzipf zipf 1.2 100000 20
+ *	./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20
  *
  * Only the distribution type (zipf or pareto) and spread input need
  * to be given, if not given defaults are used.
@@ -14,21 +14,18 @@
  */
 #include <stdio.h>
 #include <stdlib.h>
-#include <fcntl.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "../lib/zipf.h"
+#include "../lib/gauss.h"
 #include "../flist.h"
 #include "../hash.h"
-#include "../rbtree.h"
 
-#define DEF_NR		1000000
-#define DEF_NR_OUTPUT	23
+#define DEF_NR_OUTPUT	20
 
 struct node {
 	struct flist_head list;
-	struct rb_node rb;
 	unsigned long long val;
 	unsigned long hits;
 };
@@ -36,28 +33,39 @@ struct node {
 static struct flist_head *hash;
 static unsigned long hash_bits = 24;
 static unsigned long hash_size = 1 << 24;
-static struct rb_root rb;
 
 enum {
 	TYPE_NONE = 0,
 	TYPE_ZIPF,
 	TYPE_PARETO,
+	TYPE_NORMAL,
+};
+static const char *dist_types[] = { "None", "Zipf", "Pareto", "Normal" };
+
+enum {
+	OUTPUT_NORMAL,
+	OUTPUT_CSV,
 };
-static const char *dist_types[] = { "None", "Zipf", "Pareto" };
 
 static int dist_type = TYPE_ZIPF;
-static unsigned long gb_size = 500;
+static unsigned long gib_size = 500;
 static unsigned long block_size = 4096;
 static unsigned long output_nranges = DEF_NR_OUTPUT;
 static double percentage;
 static double dist_val;
+static int output_type = OUTPUT_NORMAL;
 
 #define DEF_ZIPF_VAL	1.2
 #define DEF_PARETO_VAL	0.3
 
+static unsigned int hashv(unsigned long long val)
+{
+	return jhash(&val, sizeof(val), 0) & (hash_size - 1);
+}
+
 static struct node *hash_lookup(unsigned long long val)
 {
-	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+	struct flist_head *l = &hash[hashv(val)];
 	struct flist_head *entry;
 	struct node *n;
 
@@ -70,72 +78,39 @@ static struct node *hash_lookup(unsigned long long val)
 	return NULL;
 }
 
-static void hash_insert(unsigned long long val)
+static void hash_insert(struct node *n, unsigned long long val)
 {
-	struct flist_head *l = &hash[hash_long(val, hash_bits)];
-	struct node *n = malloc(sizeof(*n));
+	struct flist_head *l = &hash[hashv(val)];
 
 	n->val = val;
 	n->hits = 1;
 	flist_add_tail(&n->list, l);
 }
 
-static void rb_insert(struct node *n)
-{
-	struct rb_node **p, *parent;
-
-	memset(&n->rb, 0, sizeof(n->rb));
-	p = &rb.rb_node;
-	parent = NULL;
-	while (*p) {
-		struct node *__n;
-
-		parent = *p;
-		__n = rb_entry(parent, struct node, rb);
-		if (n->hits > __n->hits)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&n->rb, parent, p);
-	rb_insert_color(&n->rb, &rb);
-}
-
-static unsigned long rb_add(struct flist_head *list)
-{
-	struct flist_head *entry;
-	unsigned long ret = 0;
-	struct node *n;
-
-	flist_for_each(entry, list) {
-		n = flist_entry(entry, struct node, list);
-
-		rb_insert(n);
-		ret++;
-	}
-
-	return ret;
-}
-
-static unsigned long rb_gen(void)
+static void usage(void)
 {
-	unsigned long ret = 0;
-	unsigned int i;
-
-	for (i = 0; i < hash_size; i++)
-		ret += rb_add(&hash[i]);
-
-	return ret;
+	printf("genzipf: test zipf/pareto values for fio input\n");
+	printf("\t-h\tThis help screen\n");
+	printf("\t-p\tGenerate size of data set that are hit by this percentage\n");
+	printf("\t-t\tDistribution type (zipf, pareto, or normal)\n");
+	printf("\t-i\tDistribution algorithm input (zipf theta, pareto power,\n"
+		"\t\tor normal %% deviation)\n");
+	printf("\t-b\tBlock size of a given range (in bytes)\n");
+	printf("\t-g\tSize of data set (in gigabytes)\n");
+	printf("\t-o\tNumber of output rows\n");
+	printf("\t-c\tOutput ranges in CSV format\n");
 }
 
 static int parse_options(int argc, char *argv[])
 {
-	const char *optstring = "t:g:i:o:b:p:";
+	const char *optstring = "t:g:i:o:b:p:ch";
 	int c, dist_val_set = 0;
 
 	while ((c = getopt(argc, argv, optstring)) != -1) {
 		switch (c) {
+		case 'h':
+			usage();
+			return 1;
 		case 'p':
 			percentage = atof(optarg);
 			break;
@@ -147,13 +122,15 @@ static int parse_options(int argc, char *argv[])
 				dist_type = TYPE_ZIPF;
 			else if (!strncmp(optarg, "pareto", 6))
 				dist_type = TYPE_PARETO;
+			else if (!strncmp(optarg, "normal", 6))
+				dist_type = TYPE_NORMAL;
 			else {
 				printf("wrong dist type: %s\n", optarg);
 				return 1;
 			}
 			break;
 		case 'g':
-			gb_size = strtoul(optarg, NULL, 10);
+			gib_size = strtoul(optarg, NULL, 10);
 			break;
 		case 'i':
 			dist_val = atof(optarg);
@@ -162,6 +139,9 @@ static int parse_options(int argc, char *argv[])
 		case 'o':
 			output_nranges = strtoul(optarg, NULL, 10);
 			break;
+		case 'c':
+			output_type = OUTPUT_CSV;
+			break;
 		default:
 			printf("bad option %c\n", c);
 			return 1;
@@ -192,93 +172,61 @@ struct output_sum {
 	unsigned int nranges;
 };
 
-int main(int argc, char *argv[])
+static int node_cmp(const void *p1, const void *p2)
 {
-	unsigned long offset;
-	unsigned long i, j, nr_vals, cur_vals, interval, total_vals;
-	unsigned long long nranges;
-	struct output_sum *output_sums;
-	double perc, perc_i;
-	struct zipf_state zs;
-	struct rb_node *n;
-
-	if (parse_options(argc, argv))
-		return 1;
-
-	printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size);
-
-	nranges = gb_size * 1024 * 1024 * 1024ULL;
-	nranges /= block_size;
-
-	if (dist_type == TYPE_ZIPF)
-		zipf_init(&zs, nranges, dist_val, 1);
-	else
-		pareto_init(&zs, nranges, dist_val, 1);
-
-	hash_bits = 0;
-	hash_size = nranges;
-	while ((hash_size >>= 1) != 0)
-		hash_bits++;
-
-	hash_size = 1 << hash_bits;
-
-	hash = malloc(hash_size * sizeof(struct flist_head));
-	for (i = 0; i < hash_size; i++)
-		INIT_FLIST_HEAD(&hash[i]);
-
-	for (nr_vals = i = 0; i < nranges; i++) {
-		struct node *n;
+	const struct node *n1 = p1;
+	const struct node *n2 = p2;
 
-		if (dist_type == TYPE_ZIPF)
-			offset = zipf_next(&zs);
-		else
-			offset = pareto_next(&zs);
-
-		n = hash_lookup(offset);
-		if (n)
-			n->hits++;
-		else
-			hash_insert(offset);
+	return n2->hits - n1->hits;
+}
 
-		nr_vals++;
-	}
+static void output_csv(struct node *nodes, unsigned long nnodes)
+{
+	unsigned long i;
 
-	nr_vals = rb_gen();
+	printf("rank, count\n");
+	for (i = 0; i < nnodes; i++)
+		printf("%lu, %lu\n", i, nodes[i].hits);
+}
 
-	interval = (nr_vals + output_nranges - 1) / output_nranges;
+static void output_normal(struct node *nodes, unsigned long nnodes,
+			  unsigned long nranges)
+{
+	unsigned long i, j, cur_vals, interval_step, next_interval, total_vals;
+	unsigned long blocks = percentage * nnodes / 100;
+	double hit_percent_sum = 0;
+	unsigned long long hit_sum = 0;
+	double perc, perc_i;
+	struct output_sum *output_sums;
 
+	interval_step = (nnodes - 1) / output_nranges + 1;
+	next_interval = interval_step;
 	output_sums = malloc(output_nranges * sizeof(struct output_sum));
+
 	for (i = 0; i < output_nranges; i++) {
 		output_sums[i].output = 0.0;
-		output_sums[i].nranges = 1;
+		output_sums[i].nranges = 0;
 	}
 
-	total_vals = i = j = cur_vals = 0;
-	
-	n = rb_first(&rb);
-	while (n) {
-		struct node *node = rb_entry(n, struct node, rb);
-		struct output_sum *os = &output_sums[j];
+	j = total_vals = cur_vals = 0;
 
-		if (i >= interval) {
-			os->output = (double) (cur_vals + 1) / (double) nranges;
+	for (i = 0; i < nnodes; i++) {
+		struct output_sum *os = &output_sums[j];
+		struct node *node = &nodes[i];
+		cur_vals += node->hits;
+		total_vals += node->hits;
+		os->nranges += node->hits;
+		if (i == (next_interval) -1 || i == nnodes - 1) {
+			os->output = (double) cur_vals / (double) nranges;
 			os->output *= 100.0;
+			cur_vals = 0;
+			next_interval += interval_step;
 			j++;
-			cur_vals = node->hits;
-			interval += (nr_vals + output_nranges - 1) / output_nranges;
-		} else {
-			cur_vals += node->hits;
-			os->nranges += node->hits;
 		}
 
-		i++;
-		total_vals += node->hits;
-
 		if (percentage) {
-			unsigned long blocks = percentage * nranges / 100;
-
 			if (total_vals >= blocks) {
-				double cs = i * block_size / (1024 * 1024);
+				double cs = (double) i * block_size / (1024.0 * 1024.0);
 				char p = 'M';
 
 				if (cs > 1024.0) {
@@ -294,18 +242,16 @@ int main(int argc, char *argv[])
 				percentage = 0.0;
 			}
 		}
-
-		n = rb_next(n);
 	}
 
-	perc_i = 100.0 / (double) output_nranges;
+	perc_i = 100.0 / (double)output_nranges;
 	perc = 0.0;
 
-	printf("\n   Rows           Hits           No Hits         Size\n");
-	printf("--------------------------------------------------------\n");
-	for (i = 0; i < j; i++) {
+	printf("\n   Rows           Hits %%         Sum %%           # Hits          Size\n");
+	printf("-----------------------------------------------------------------------\n");
+	for (i = 0; i < output_nranges; i++) {
 		struct output_sum *os = &output_sums[i];
-		double gb = (double) os->nranges * block_size / 1024.0;
+		double gb = (double)os->nranges * block_size / 1024.0;
 		char p = 'K';
 
 		if (gb > 1024.0) {
@@ -318,10 +264,86 @@ int main(int argc, char *argv[])
 		}
 
 		perc += perc_i;
-		printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n", i ? "|->" : "Top", perc, os->output, os->nranges, gb, p);
+		hit_percent_sum += os->output;
+		hit_sum += os->nranges;
+		printf("%s %6.2f%%\t%6.2f%%\t\t%6.2f%%\t\t%8u\t%6.2f%c\n",
+			i ? "|->" : "Top", perc, os->output, hit_percent_sum,
+			os->nranges, gb, p);
 	}
 
+	printf("-----------------------------------------------------------------------\n");
+	printf("Total\t\t\t\t\t\t%8llu\n", hit_sum);
 	free(output_sums);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long offset;
+	unsigned long long nranges;
+	unsigned long nnodes;
+	struct node *nodes;
+	struct zipf_state zs;
+	struct gauss_state gs;
+	int i, j;
+
+	if (parse_options(argc, argv))
+		return 1;
+
+	if (output_type != OUTPUT_CSV)
+		printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n",
+		       dist_types[dist_type], dist_val, gib_size, block_size);
+
+	nranges = gib_size * 1024 * 1024 * 1024ULL;
+	nranges /= block_size;
+
+	if (dist_type == TYPE_ZIPF)
+		zipf_init(&zs, nranges, dist_val, 1);
+	else if (dist_type == TYPE_PARETO)
+		pareto_init(&zs, nranges, dist_val, 1);
+	else
+		gauss_init(&gs, nranges, dist_val, 1);
+
+	hash_bits = 0;
+	hash_size = nranges;
+	while ((hash_size >>= 1) != 0)
+		hash_bits++;
+
+	hash_size = 1 << hash_bits;
+
+	hash = calloc(hash_size, sizeof(struct flist_head));
+	for (i = 0; i < hash_size; i++)
+		INIT_FLIST_HEAD(&hash[i]);
+
+	nodes = malloc(nranges * sizeof(struct node));
+
+	for (i = j = 0; i < nranges; i++) {
+		struct node *n;
+
+		if (dist_type == TYPE_ZIPF)
+			offset = zipf_next(&zs);
+		else if (dist_type == TYPE_PARETO)
+			offset = pareto_next(&zs);
+		else
+			offset = gauss_next(&gs);
+
+		n = hash_lookup(offset);
+		if (n)
+			n->hits++;
+		else {
+			hash_insert(&nodes[j], offset);
+			j++;
+		}
+	}
+
+	qsort(nodes, j, sizeof(struct node), node_cmp);
+	nnodes = j;
+
+	if (output_type == OUTPUT_CSV)
+		output_csv(nodes, nnodes);
+	else
+		output_normal(nodes, nnodes, nranges);
+
 	free(hash);
+	free(nodes);
 	return 0;
 }