X-Git-Url: https://git.kernel.dk/?a=blobdiff_plain;f=t%2Fgenzipf.c;h=4fc10ae72d8421e3407d0239d0f96c2257529d59;hb=5031b5cf997db16c9e3d95eaba8587a3beed892a;hp=adf914e235d184659d3ea58c0ab9f2cf54a7c088;hpb=0779dfc832be080f8f7ac2668cdeca13c18781f9;p=fio.git

diff --git a/t/genzipf.c b/t/genzipf.c
index adf914e2..4fc10ae7 100644
--- a/t/genzipf.c
+++ b/t/genzipf.c
@@ -3,108 +3,347 @@
  * what an access pattern would look like.
  *
  * For instance, the following would generate a zipf distribution
- * with theta 1.2, using 100,000 values and split the reporting into
- * 20 buckets:
+ * with theta 1.2, using 262144 (1 GiB / 4096) values and split the
+ * reporting into 20 buckets:
  *
- *	t/genzipf zipf 1.2 100000 20
+ *	./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20
+ *
+ * Only the distribution type (zipf or pareto) and spread input need
+ * to be given, if not given defaults are used.
  *
  */
 #include <stdio.h>
 #include <stdlib.h>
-#include <fcntl.h>
 #include <string.h>
 
 #include "../lib/zipf.h"
+#include "../lib/gauss.h"
+#include "../flist.h"
+#include "../hash.h"
+
+#define DEF_NR_OUTPUT 20
+
+struct node {
+	struct flist_head list;
+	unsigned long long val;
+	unsigned long hits;
+};
+
+static struct flist_head *hash;
+static unsigned long hash_bits = 24;
+static unsigned long hash_size = 1 << 24;
+
+enum {
+	TYPE_NONE = 0,
+	TYPE_ZIPF,
+	TYPE_PARETO,
+	TYPE_NORMAL,
+};
+static const char *dist_types[] = { "None", "Zipf", "Pareto", "Normal" };
 
-static int val_cmp(const void *p1, const void *p2)
+enum {
+	OUTPUT_NORMAL,
+	OUTPUT_CSV,
+};
+
+static int dist_type = TYPE_ZIPF;
+static unsigned long gib_size = 500;
+static unsigned long block_size = 4096;
+static unsigned long output_nranges = DEF_NR_OUTPUT;
+static double percentage;
+static double dist_val;
+static int output_type = OUTPUT_NORMAL;
+
+#define DEF_ZIPF_VAL 1.2
+#define DEF_PARETO_VAL 0.3
+
+static unsigned int hashv(unsigned long long val)
 {
-	const unsigned long *v1 = p1;
-	const unsigned long *v2 = p2;
+	return jhash(&val, sizeof(val), 0) & (hash_size - 1);
+}
 
-	return *v1 - *v2;
+static struct node *hash_lookup(unsigned long long val)
+{
+	struct flist_head *l = &hash[hashv(val)];
+	struct flist_head *entry;
+	struct node *n;
+
+	flist_for_each(entry, l) {
+		n = flist_entry(entry, struct node, list);
+		if (n->val == val)
+			return n;
+	}
+
+	return NULL;
 }
 
-int main(int argc, char *argv[])
+static void hash_insert(struct node *n, unsigned long long val)
 {
-	unsigned long nranges, output_nranges;
-	unsigned long *vals;
-	unsigned long i, j, nr_vals, cur_vals, max_val, interval;
-	double *output;
-	struct zipf_state zs;
-	int use_zipf;
-	double val;
+	struct flist_head *l = &hash[hashv(val)];
 
-	if (argc < 4) {
-		printf("%s: {zipf,pareto} val values [output ranges]\n", argv[0]);
-		return 1;
+	n->val = val;
+	n->hits = 1;
+	flist_add_tail(&n->list, l);
+}
+
+static void usage(void)
+{
+	printf("genzipf: test zipf/pareto values for fio input\n");
+	printf("\t-h\tThis help screen\n");
+	printf("\t-p\tGenerate size of data set that are hit by this percentage\n");
+	printf("\t-t\tDistribution type (zipf, pareto, or normal)\n");
+	printf("\t-i\tDistribution algorithm input (zipf theta, pareto power,\n"
+		"\t\tor normal %% deviation)\n");
+	printf("\t-b\tBlock size of a given range (in bytes)\n");
+	printf("\t-g\tSize of data set (in gigabytes)\n");
+	printf("\t-o\tNumber of output rows\n");
+	printf("\t-c\tOutput ranges in CSV format\n");
+}
+
+static int parse_options(int argc, char *argv[])
+{
+	const char *optstring = "t:g:i:o:b:p:ch";
+	int c, dist_val_set = 0;
+
+	while ((c = getopt(argc, argv, optstring)) != -1) {
+		switch (c) {
+		case 'h':
+			usage();
+			return 1;
+		case 'p':
+			percentage = atof(optarg);
+			break;
+		case 'b':
+			block_size = strtoul(optarg, NULL, 10);
+			break;
+		case 't':
+			if (!strncmp(optarg, "zipf", 4))
+				dist_type = TYPE_ZIPF;
+			else if (!strncmp(optarg, "pareto", 6))
+				dist_type = TYPE_PARETO;
+			else if (!strncmp(optarg, "normal", 6))
+				dist_type = TYPE_NORMAL;
+			else {
+				printf("wrong dist type: %s\n", optarg);
+				return 1;
+			}
+			break;
+		case 'g':
+			gib_size = strtoul(optarg, NULL, 10);
+			break;
+		case 'i':
+			dist_val = atof(optarg);
+			dist_val_set = 1;
+			break;
+		case 'o':
+			output_nranges = strtoul(optarg, NULL, 10);
+			break;
+		case 'c':
+			output_type = OUTPUT_CSV;
+			break;
+		default:
+			printf("bad option %c\n", c);
+			return 1;
+		}
 	}
 
-	if (!strcmp(argv[1], "zipf"))
-		use_zipf = 1;
-	else if (!strcmp(argv[1], "pareto"))
-		use_zipf = 0;
-	else {
-		printf("Bad distribution type <%s>\n", argv[1]);
-		return 1;
+	if (dist_type == TYPE_PARETO) {
+		if ((dist_val >= 1.00 || dist_val < 0.00)) {
+			printf("pareto input must be > 0.00 and < 1.00\n");
+			return 1;
+		}
+		if (!dist_val_set)
+			dist_val = DEF_PARETO_VAL;
+	} else if (dist_type == TYPE_ZIPF) {
+		if (dist_val == 1.0) {
+			printf("zipf input must be different than 1.0\n");
+			return 1;
+		}
+		if (!dist_val_set)
+			dist_val = DEF_ZIPF_VAL;
 	}
 
-	val = atof(argv[2]);
-	nranges = strtoul(argv[3], NULL, 10);
-	if (argc == 5)
-		output_nranges = strtoul(argv[4], NULL, 10);
-	else
-		output_nranges = nranges;
+	return 0;
+}
 
-	printf("Generating %s distribution with %f input and %lu ranges\n", use_zipf ? "zipf" : "pareto", val, nranges);
-	getchar();
+struct output_sum {
+	double output;
+	unsigned int nranges;
+};
 
-	if (use_zipf)
-		zipf_init(&zs, nranges, val);
-	else
-		pareto_init(&zs, nranges, val);
+static int node_cmp(const void *p1, const void *p2)
+{
+	const struct node *n1 = p1;
+	const struct node *n2 = p2;
 
-	vals = malloc(nranges * sizeof(unsigned long));
+	return n2->hits - n1->hits;
+}
 
-	max_val = nr_vals = 0;
-	for (i = 0; i < nranges; i++) {
-		if (use_zipf)
-			vals[nr_vals] = zipf_next(&zs);
-		else
-			vals[nr_vals] = pareto_next(&zs);
+static void output_csv(struct node *nodes, unsigned long nnodes)
+{
+	unsigned long i;
 
-		if (vals[nr_vals] > max_val)
-			max_val = vals[nr_vals];
-		nr_vals++;
-	}
+	printf("rank, count\n");
+	for (i = 0; i < nnodes; i++)
+		printf("%lu, %lu\n", i, nodes[i].hits);
+}
+
+static void output_normal(struct node *nodes, unsigned long nnodes,
+			  unsigned long nranges)
+{
+	unsigned long i, j, cur_vals, interval_step, next_interval, total_vals;
+	unsigned long blocks = percentage * nnodes / 100;
+	double hit_percent_sum = 0;
+	unsigned long long hit_sum = 0;
+	double perc, perc_i;
+	struct output_sum *output_sums;
 
-	qsort(vals, nr_vals, sizeof(unsigned long), val_cmp);
+	interval_step = (nnodes - 1) / output_nranges + 1;
+	next_interval = interval_step;
+	output_sums = malloc(output_nranges * sizeof(struct output_sum));
 
-	interval = (max_val + output_nranges - 1) / output_nranges;
+	for (i = 0; i < output_nranges; i++) {
+		output_sums[i].output = 0.0;
+		output_sums[i].nranges = 0;
+	}
 
-	output = malloc(output_nranges * sizeof(double));
+	j = total_vals = cur_vals = 0;
 
-	for (i = j = 0, cur_vals = 0; i < nr_vals; i++) {
-		if (vals[i] > interval) {
-			output[j] = (double) (cur_vals + 1) / (double) nr_vals;
-			output[j] *= 100.0;
-			j++;
+	for (i = 0; i < nnodes; i++) {
+		struct output_sum *os = &output_sums[j];
+		struct node *node = &nodes[i];
+		cur_vals += node->hits;
+		total_vals += node->hits;
+		os->nranges += node->hits;
+		if (i == (next_interval) -1 || i == nnodes - 1) {
+			os->output = (double) cur_vals / (double) nranges;
+			os->output *= 100.0;
 			cur_vals = 0;
-			interval += (max_val + output_nranges - 1) / output_nranges;
-			continue;
+			next_interval += interval_step;
+			j++;
+		}
+
+		if (percentage) {
+			if (total_vals >= blocks) {
+				double cs = (double) i * block_size / (1024.0 * 1024.0);
+				char p = 'M';
+
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'G';
+				}
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'T';
+				}
+
+				printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p);
+				percentage = 0.0;
+			}
 		}
-		cur_vals++;
 	}
 
-	if (cur_vals) {
-		output[j] = (double) (cur_vals + 1) / (double) nr_vals;
-		output[j] *= 100.0;
-		j++;
+	perc_i = 100.0 / (double)output_nranges;
+	perc = 0.0;
+
+	printf("\n Rows Hits %% Sum %% # Hits Size\n");
+	printf("-----------------------------------------------------------------------\n");
+	for (i = 0; i < output_nranges; i++) {
+		struct output_sum *os = &output_sums[i];
+		double gb = (double)os->nranges * block_size / 1024.0;
+		char p = 'K';
+
+		if (gb > 1024.0) {
+			p = 'M';
+			gb /= 1024.0;
+		}
+		if (gb > 1024.0) {
+			p = 'G';
+			gb /= 1024.0;
+		}
+
+		perc += perc_i;
+		hit_percent_sum += os->output;
+		hit_sum += os->nranges;
+		printf("%s %6.2f%%\t%6.2f%%\t\t%6.2f%%\t\t%8u\t%6.2f%c\n",
+			i ? "|->" : "Top", perc, os->output, hit_percent_sum,
+			os->nranges, gb, p);
+	}
+
+	printf("-----------------------------------------------------------------------\n");
+	printf("Total\t\t\t\t\t\t%8llu\n", hit_sum);
+	free(output_sums);
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long offset;
+	unsigned long long nranges;
+	unsigned long nnodes;
+	struct node *nodes;
+	struct zipf_state zs;
+	struct gauss_state gs;
+	int i, j;
+
+	if (parse_options(argc, argv))
+		return 1;
+
+	if (output_type != OUTPUT_CSV)
+		printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n",
+			dist_types[dist_type], dist_val, gib_size, block_size);
+
+	nranges = gib_size * 1024 * 1024 * 1024ULL;
+	nranges /= block_size;
+
+	if (dist_type == TYPE_ZIPF)
+		zipf_init(&zs, nranges, dist_val, 1);
+	else if (dist_type == TYPE_PARETO)
+		pareto_init(&zs, nranges, dist_val, 1);
+	else
+		gauss_init(&gs, nranges, dist_val, 1);
+
+	hash_bits = 0;
+	hash_size = nranges;
+	while ((hash_size >>= 1) != 0)
+		hash_bits++;
+
+	hash_size = 1 << hash_bits;
+
+	hash = calloc(hash_size, sizeof(struct flist_head));
+	for (i = 0; i < hash_size; i++)
+		INIT_FLIST_HEAD(&hash[i]);
+
+	nodes = malloc(nranges * sizeof(struct node));
+
+	for (i = j = 0; i < nranges; i++) {
+		struct node *n;
+
+		if (dist_type == TYPE_ZIPF)
+			offset = zipf_next(&zs);
+		else if (dist_type == TYPE_PARETO)
+			offset = pareto_next(&zs);
+		else
+			offset = gauss_next(&gs);
+
+		n = hash_lookup(offset);
+		if (n)
+			n->hits++;
+		else {
+			hash_insert(&nodes[j], offset);
+			j++;
+		}
 	}
 
-	for (i = 0; i < j; i++)
-		printf("%.2f%%\n", output[i]);
+	qsort(nodes, j, sizeof(struct node), node_cmp);
+	nnodes = j;
+
+	if (output_type == OUTPUT_CSV)
+		output_csv(nodes, nnodes);
+	else
+		output_normal(nodes, nnodes, nranges);
 
-	free(output);
-	free(vals);
+	free(hash);
+	free(nodes);
 	return 0;
 }
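
As the usage() text above notes, the tool exists to preview zipf/pareto/normal values before they are fed to fio itself. For reference only (not part of this change), a minimal fio job that requests the same access pattern as the header example (-t zipf -i 1.2 -g 1 -b 4096) might look like the sketch below; the [zipf-test] job name and the psync engine are illustrative choices, while random_distribution, size and bs are the fio options the genzipf inputs map onto.

	[zipf-test]
	ioengine=psync
	rw=randread
	size=1g
	bs=4096
	random_distribution=zipf:1.2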