d315369a852030c079f3a3e6c73cb8515c423092
[fio.git] / t / genzipf.c
1 /*
2  * Generate/analyze pareto/zipf distributions to better understand
3  * what an access pattern would look like.
4  *
5  * For instance, the following would generate a zipf distribution
6  * with theta 1.2, using 262144 (1 GB / 4096) values and split the reporting into
7  * 20 buckets:
8  *
9  *      ./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20
10  *
11  * Only the distribution type (zipf or pareto) and spread input need
12  * to be given, if not given defaults are used.
13  *
14  */
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <fcntl.h>
18 #include <string.h>
19 #include <unistd.h>
20
21 #include "../lib/zipf.h"
22 #include "../lib/gauss.h"
23 #include "../flist.h"
24 #include "../hash.h"
25
26 #define DEF_NR_OUTPUT   20
27
28 struct node {
29         struct flist_head list;
30         unsigned long long val;
31         unsigned long hits;
32 };
33
34 static struct flist_head *hash;
35 static unsigned long hash_bits = 24;
36 static unsigned long hash_size = 1 << 24;
37
38 enum {
39         TYPE_NONE = 0,
40         TYPE_ZIPF,
41         TYPE_PARETO,
42         TYPE_NORMAL,
43 };
44 static const char *dist_types[] = { "None", "Zipf", "Pareto", "Normal" };
45
46 static int dist_type = TYPE_ZIPF;
47 static unsigned long gb_size = 500;
48 static unsigned long block_size = 4096;
49 static unsigned long output_nranges = DEF_NR_OUTPUT;
50 static double percentage;
51 static double dist_val;
52 static int output_csv = 0;
53
54 #define DEF_ZIPF_VAL    1.2
55 #define DEF_PARETO_VAL  0.3
56
57 static struct node *hash_lookup(unsigned long long val)
58 {
59         struct flist_head *l = &hash[hash_long(val, hash_bits)];
60         struct flist_head *entry;
61         struct node *n;
62
63         flist_for_each(entry, l) {
64                 n = flist_entry(entry, struct node, list);
65                 if (n->val == val)
66                         return n;
67         }
68
69         return NULL;
70 }
71
72 static struct node *hash_insert(struct node *n, unsigned long long val)
73 {
74         struct flist_head *l = &hash[hash_long(val, hash_bits)];
75
76         n->val = val;
77         n->hits = 1;
78         flist_add_tail(&n->list, l);
79         return n;
80 }
81
82 static void usage(void)
83 {
84         printf("genzipf: test zipf/pareto values for fio input\n");
85         printf("\t-h\tThis help screen\n");
86         printf("\t-p\tGenerate size of data set that are hit by this percentage\n");
87         printf("\t-t\tDistribution type (zipf, pareto, or normal)\n");
88         printf("\t-i\tDistribution algorithm input (zipf theta, pareto power,\n"
89                 "\t\tor normal %% deviation)\n");
90         printf("\t-b\tBlock size of a given range (in bytes)\n");
91         printf("\t-g\tSize of data set (in gigabytes)\n");
92         printf("\t-o\tNumber of output rows\n");
93         printf("\t-c\tOutput ranges in CSV format\n");
94 }
95
96 static int parse_options(int argc, char *argv[])
97 {
98         const char *optstring = "t:g:i:o:b:p:ch";
99         int c, dist_val_set = 0;
100
101         while ((c = getopt(argc, argv, optstring)) != -1) {
102                 switch (c) {
103                 case 'h':
104                         usage();
105                         return 1;
106                 case 'p':
107                         percentage = atof(optarg);
108                         break;
109                 case 'b':
110                         block_size = strtoul(optarg, NULL, 10);
111                         break;
112                 case 't':
113                         if (!strncmp(optarg, "zipf", 4))
114                                 dist_type = TYPE_ZIPF;
115                         else if (!strncmp(optarg, "pareto", 6))
116                                 dist_type = TYPE_PARETO;
117                         else if (!strncmp(optarg, "normal", 6))
118                                 dist_type = TYPE_NORMAL;
119                         else {
120                                 printf("wrong dist type: %s\n", optarg);
121                                 return 1;
122                         }
123                         break;
124                 case 'g':
125                         gb_size = strtoul(optarg, NULL, 10);
126                         break;
127                 case 'i':
128                         dist_val = atof(optarg);
129                         dist_val_set = 1;
130                         break;
131                 case 'o':
132                         output_nranges = strtoul(optarg, NULL, 10);
133                         break;
134                 case 'c':
135                         output_csv = 1;
136                         break;
137                 default:
138                         printf("bad option %c\n", c);
139                         return 1;
140                 }
141         }
142
143         if (dist_type == TYPE_PARETO) {
144                 if ((dist_val >= 1.00 || dist_val < 0.00)) {
145                         printf("pareto input must be > 0.00 and < 1.00\n");
146                         return 1;
147                 }
148                 if (!dist_val_set)
149                         dist_val = DEF_PARETO_VAL;
150         } else if (dist_type == TYPE_ZIPF) {
151                 if (dist_val == 1.0) {
152                         printf("zipf input must be different than 1.0\n");
153                         return 1;
154                 }
155                 if (!dist_val_set)
156                         dist_val = DEF_ZIPF_VAL;
157         }
158
159         return 0;
160 }
161
162 struct output_sum {
163         double output;
164         unsigned int nranges;
165 };
166
167 static int node_cmp(const void *p1, const void *p2)
168 {
169         const struct node *n1 = p1;
170         const struct node *n2 = p2;
171
172         return n2->hits - n1->hits;
173 }
174
175 int main(int argc, char *argv[])
176 {
177         unsigned long offset;
178         unsigned long i, j, k, nr_vals, cur_vals, interval_step, next_interval, total_vals, nnodes;
179         unsigned long long nranges;
180         struct output_sum *output_sums;
181         struct node *nodes;
182         double perc, perc_i;
183         struct zipf_state zs;
184         struct gauss_state gs;
185
186         if (parse_options(argc, argv))
187                 return 1;
188
189         if( !output_csv )
190                 printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size);
191
192         nranges = gb_size * 1024 * 1024 * 1024ULL;
193         nranges /= block_size;
194
195         if (dist_type == TYPE_ZIPF)
196                 zipf_init(&zs, nranges, dist_val, 1);
197         else if (dist_type == TYPE_PARETO)
198                 pareto_init(&zs, nranges, dist_val, 1);
199         else
200                 gauss_init(&gs, nranges, dist_val, 1);
201
202         hash_bits = 0;
203         hash_size = nranges;
204         while ((hash_size >>= 1) != 0)
205                 hash_bits++;
206
207         hash_size = 1 << hash_bits;
208
209         hash = malloc(hash_size * sizeof(struct flist_head));
210         for (i = 0; i < hash_size; i++)
211                 INIT_FLIST_HEAD(&hash[i]);
212
213         nodes = malloc(nranges * sizeof(struct node));
214
215         for (i = j = 0; i < nranges; i++) {
216                 struct node *n;
217
218                 if (dist_type == TYPE_ZIPF)
219                         offset = zipf_next(&zs);
220                 else if (dist_type == TYPE_PARETO)
221                         offset = pareto_next(&zs);
222                 else
223                         offset = gauss_next(&gs);
224
225                 n = hash_lookup(offset);
226                 if (n)
227                         n->hits++;
228                 else {
229                         hash_insert(&nodes[j], offset);
230                         j++;
231                 }
232
233         }
234
235         qsort(nodes, j, sizeof(struct node), node_cmp);
236         nnodes = j;
237         nr_vals = nnodes;
238
239         if (output_csv) {
240                 printf("rank, count\n");
241                 for (k = 0; k < nnodes; k++) {
242                         printf("%lu, %lu\n", k, nodes[k].hits);
243                 }
244         } else {
245                 unsigned long blocks = percentage * nranges / 100;
246                 double hit_percent_sum = 0;
247                 unsigned long long hit_sum = 0;
248                 interval_step = (nr_vals - 1) / output_nranges + 1;
249                 next_interval = interval_step;
250                 output_sums = malloc(output_nranges * sizeof(struct output_sum));
251                 for (i = 0; i < output_nranges; i++) {
252                         output_sums[i].output = 0.0;
253                         output_sums[i].nranges = 0;
254                 }
255
256                 j = total_vals = cur_vals = 0;
257
258                 for (k = 0; k < nnodes; k++) {
259                         struct output_sum *os = &output_sums[j];
260                         struct node *node = &nodes[k];
261                         cur_vals += node->hits;
262                         total_vals += node->hits;
263                         os->nranges += node->hits;
264                         if (k == (next_interval)  -1 || k == (nnodes - 1)) {
265                                 os->output = (double)(cur_vals) / (double)nranges;
266                                 os->output *= 100.0;
267                                 cur_vals = 0;
268                                 next_interval += interval_step;
269                                 j++;
270                         }
271
272                         if (percentage) {
273                                 if (total_vals >= blocks) {
274                                         double cs = k * block_size / (1024 * 1024);
275                                         char p = 'M';
276
277                                         if (cs > 1024.0) {
278                                                 cs /= 1024.0;
279                                                 p = 'G';
280                                         }
281                                         if (cs > 1024.0) {
282                                                 cs /= 1024.0;
283                                                 p = 'T';
284                                         }
285
286                                         printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p);
287                                         percentage = 0.0;
288                                 }
289                         }
290                 }
291
292                 perc_i = 100.0 / (double)output_nranges;
293                 perc = 0.0;
294
295                 printf("\n   Rows           Hits %%         No Hits         Size\n");
296                 printf("--------------------------------------------------------\n");
297                 for (i = 0; i < output_nranges; i++) {
298                         struct output_sum *os = &output_sums[i];
299                         double gb = (double)os->nranges * block_size / 1024.0;
300                         char p = 'K';
301
302                         if (gb > 1024.0) {
303                                 p = 'M';
304                                 gb /= 1024.0;
305                         }
306                         if (gb > 1024.0) {
307                                 p = 'G';
308                                 gb /= 1024.0;
309                         }
310
311                         perc += perc_i;
312                         hit_percent_sum += os->output;
313                         hit_sum += os->nranges;
314                         printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n",
315                                i ? "|->" : "Top", perc, os->output, os->nranges,
316                                gb, p);
317                 }
318                 printf("--------------------------------------------------------\n");
319                 printf("Total\t\t%6.2f%%\t\t%8llu\n", hit_percent_sum, hit_sum);
320                 free(output_sums);
321         }
322
323         free(hash);
324         free(nodes);
325         return 0;
326 }