zipf: use 64-bit safe hash for zipf/pareto
[fio.git] / t / genzipf.c
... / ...
CommitLineData
1/*
2 * Generate/analyze pareto/zipf distributions to better understand
3 * what an access pattern would look like.
4 *
5 * For instance, the following would generate a zipf distribution
6 * with theta 1.2, using 100,000 values and split the reporting into
7 * 20 buckets:
8 *
9 * t/genzipf zipf 1.2 100000 20
10 *
11 * Only the distribution type (zipf or pareto) and spread input need
12 * to be given, if not given defaults are used.
13 *
14 */
15#include <stdio.h>
16#include <stdlib.h>
17#include <fcntl.h>
18#include <string.h>
19#include <unistd.h>
20
21#include "../lib/zipf.h"
22#include "../flist.h"
23#include "../hash.h"
24#include "../rbtree.h"
25
26#define DEF_NR 1000000
27#define DEF_NR_OUTPUT 23
28
29struct node {
30 struct flist_head list;
31 struct rb_node rb;
32 unsigned long long val;
33 unsigned long hits;
34};
35
36static struct flist_head *hash;
37static unsigned long hash_bits = 24;
38static unsigned long hash_size = 1 << 24;
39static struct rb_root rb;
40
41enum {
42 TYPE_NONE = 0,
43 TYPE_ZIPF,
44 TYPE_PARETO,
45};
46static const char *dist_types[] = { "None", "Zipf", "Pareto" };
47
48static int dist_type = TYPE_ZIPF;
49static unsigned long gb_size = 500;
50static unsigned long nranges = DEF_NR;
51static unsigned long output_nranges = DEF_NR_OUTPUT;
52static double dist_val;
53
54#define DEF_ZIPF_VAL 1.2
55#define DEF_PARETO_VAL 0.3
56
57static struct node *hash_lookup(unsigned long long val)
58{
59 struct flist_head *l = &hash[hash_long(val, hash_bits)];
60 struct flist_head *entry;
61 struct node *n;
62
63 flist_for_each(entry, l) {
64 n = flist_entry(entry, struct node, list);
65 if (n->val == val)
66 return n;
67 }
68
69 return NULL;
70}
71
72static void hash_insert(unsigned long long val)
73{
74 struct flist_head *l = &hash[hash_long(val, hash_bits)];
75 struct node *n = malloc(sizeof(*n));
76
77 n->val = val;
78 n->hits = 1;
79 flist_add_tail(&n->list, l);
80}
81
82static void rb_insert(struct node *n)
83{
84 struct rb_node **p, *parent;
85
86 memset(&n->rb, 0, sizeof(n->rb));
87 p = &rb.rb_node;
88 parent = NULL;
89 while (*p) {
90 struct node *__n;
91
92 parent = *p;
93 __n = rb_entry(parent, struct node, rb);
94 if (n->hits > __n->hits)
95 p = &(*p)->rb_left;
96 else
97 p = &(*p)->rb_right;
98 }
99
100 rb_link_node(&n->rb, parent, p);
101 rb_insert_color(&n->rb, &rb);
102}
103
104static unsigned long rb_add(struct flist_head *list)
105{
106 struct flist_head *entry;
107 unsigned long ret = 0;
108 struct node *n;
109
110 flist_for_each(entry, list) {
111 n = flist_entry(entry, struct node, list);
112
113 rb_insert(n);
114 ret++;
115 }
116
117 return ret;
118}
119
120static unsigned long rb_gen(void)
121{
122 unsigned long ret = 0;
123 unsigned int i;
124
125 for (i = 0; i < hash_size; i++)
126 ret += rb_add(&hash[i]);
127
128 return ret;
129}
130
131static int parse_options(int argc, char *argv[])
132{
133 const char *optstring = "t:g:i:r:o:";
134 int c, dist_val_set = 0;
135
136 while ((c = getopt(argc, argv, optstring)) != -1) {
137 switch (c) {
138 case 't':
139 if (!strncmp(optarg, "zipf", 4))
140 dist_type = TYPE_ZIPF;
141 else if (!strncmp(optarg, "pareto", 6))
142 dist_type = TYPE_PARETO;
143 else {
144 printf("wrong dist type: %s\n", optarg);
145 return 1;
146 }
147 break;
148 case 'g':
149 gb_size = strtoul(optarg, NULL, 10);
150 break;
151 case 'i':
152 dist_val = atof(optarg);
153 dist_val_set = 1;
154 break;
155 case 'r':
156 nranges = strtoul(optarg, NULL, 10);
157 break;
158 case 'o':
159 output_nranges = strtoul(optarg, NULL, 10);
160 break;
161 default:
162 printf("bad option %c\n", c);
163 return 1;
164 }
165 }
166
167 if (dist_type == TYPE_PARETO) {
168 if ((dist_val >= 1.00 || dist_val < 0.00)) {
169 printf("pareto input must be > 0.00 and < 1.00\n");
170 return 1;
171 }
172 if (!dist_val_set)
173 dist_val = DEF_PARETO_VAL;
174 } else if (dist_type == TYPE_ZIPF) {
175 if (dist_val == 1.0) {
176 printf("zipf input must be different than 1.0\n");
177 return 1;
178 }
179 if (!dist_val_set)
180 dist_val = DEF_ZIPF_VAL;
181 }
182
183 return 0;
184}
185
186int main(int argc, char *argv[])
187{
188 unsigned long offset;
189 unsigned long i, j, nr_vals, cur_vals, interval;
190 double *output, perc, perc_i;
191 struct zipf_state zs;
192 struct rb_node *n;
193
194 if (parse_options(argc, argv))
195 return 1;
196
197 printf("Generating %s distribution with %f input and %lu ranges.\n", dist_types[dist_type], dist_val, nranges);
198 printf("Using device gb=%lu\n\n", gb_size);
199
200 if (dist_type == TYPE_ZIPF)
201 zipf_init(&zs, nranges, dist_val, 1);
202 else
203 pareto_init(&zs, nranges, dist_val, 1);
204
205 hash_bits = 0;
206 hash_size = nranges;
207 while ((hash_size >>= 1) != 0)
208 hash_bits++;
209
210 hash_size = 1 << hash_bits;
211
212 hash = malloc(hash_size * sizeof(struct flist_head));
213 for (i = 0; i < hash_size; i++)
214 INIT_FLIST_HEAD(&hash[i]);
215
216 for (nr_vals = 0, i = 0; i < nranges; i++) {
217 struct node *n;
218
219 if (dist_type == TYPE_ZIPF)
220 offset = zipf_next(&zs);
221 else
222 offset = pareto_next(&zs);
223
224 n = hash_lookup(offset);
225 if (n)
226 n->hits++;
227 else
228 hash_insert(offset);
229
230 nr_vals++;
231 }
232
233 nr_vals = rb_gen();
234
235 interval = (nr_vals + output_nranges - 1) / output_nranges;
236
237 output = malloc(output_nranges * sizeof(double));
238
239 i = j = cur_vals = 0;
240
241 n = rb_first(&rb);
242 while (n) {
243 struct node *node = rb_entry(n, struct node, rb);
244
245 if (i >= interval) {
246 output[j] = (double) (cur_vals + 1) / (double) nranges;
247 output[j] *= 100.0;
248 j++;
249 cur_vals = node->hits;
250 interval += (nr_vals + output_nranges - 1) / output_nranges;
251 } else
252 cur_vals += node->hits;
253
254 n = rb_next(n);
255 i++;
256 }
257
258 perc_i = 100.0 / (double) output_nranges;
259 perc = 0.0;
260
261 printf(" Rows Hits Size\n");
262 printf("-------------------------------------------\n");
263 for (i = 0; i < j; i++) {
264 double gb = (double) gb_size * perc_i / 100.0;
265 char p = 'G';
266
267 if (gb < 1.0) {
268 p = 'M';
269 gb *= 1024.0;
270 }
271
272 perc += perc_i;
273 printf("%s %6.2f%%\t%6.2f%%\t\t%6.2f%c\n", i ? "|->" : "Top", perc, output[i], gb, p);
274 }
275
276 free(output);
277 free(hash);
278 return 0;
279}