nanosecond: add test program t/time-test for experimenting with cpu clock ticks to...
[fio.git] / t/time-test.c
/*
 * Carry out arithmetic to explore conversion of CPU clock ticks to nsec
 *
 * When we use the CPU clock for timing, we do the following:
 *
 * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks
 *    to actual time.
 *
 *    Using gettimeofday() or clock_gettime(), count how many CPU clock
 *    ticks occur per usec
 *
 * 2) Calculate conversion factors so that we can ultimately convert
 *    from clock ticks to nsec with
 *        nsec = (ticks * clock_mult) >> clock_shift
 *
 *    This is equivalent to
 *        nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER
 *    where
 *        clock_mult = MULTIPLIER / cycles_per_nsec
 *        MULTIPLIER = 2^clock_shift
 *
 *    It would be simpler to just calculate nsec = ticks / cycles_per_nsec,
 *    but all of this is necessary because of rounding when calculating
 *    cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply
 *    be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional
 *    portion is lost with integer arithmetic.
 *
 *    This multiply and shift calculation also has a performance benefit
 *    as multiplication and bit shift operations are faster than integer
 *    division.
 *
 * 3) Dynamically determine clock_shift and clock_mult at run time based
 *    on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum
 *    duration for which the conversion will be valid.
 *
 *    The primary constraint is that (ticks * clock_mult) must not overflow
 *    when ticks is at its maximum value.
 *
 *    So we have
 *        max_ticks * clock_mult <= ULLONG_MAX
 *        max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX
 *        MULTIPLIER <= (ULLONG_MAX / max_ticks) * cycles_per_nsec
 *
 *    Then choose the largest clock_shift that satisfies
 *        2^clock_shift <= (ULLONG_MAX / max_ticks) * cycles_per_nsec
 *
 *    Finally calculate the appropriate clock_mult associated with clock_shift
 *        clock_mult = 2^clock_shift / cycles_per_nsec
 *
 * 4) In the code below we have cycles_per_usec and use
 *        cycles_per_nsec = cycles_per_usec / 1000
 *
 */
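
/*
 * Worked example (illustrative only, not part of the original code):
 * assume a hypothetical 3.333 GHz CPU, so cycles_per_usec = 3333, and
 * MAX_CLOCK_SEC = 365 days. Then, approximately:
 *
 *     max_ticks  = 31,536,000 s * 3333 ticks/usec * 1,000,000 usec/s ~= 1.05e17
 *     max_mult   = ULLONG_MAX / max_ticks ~= 175
 *     2^clock_shift <= max_mult * cycles_per_nsec ~= 175 * 3.333 ~= 583
 *                   => clock_shift = 9
 *     clock_mult = (1 << 9) * 1000 / 3333 = 153
 *
 * so nsec = (ticks * 153) >> 9, i.e. ticks * ~0.2988, versus an ideal factor
 * of 1/3.333 ~= 0.3000. The precision lost when MAX_CLOCK_SEC is this large
 * is what the two-stage conversion tested below tries to recover.
 */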

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <assert.h>

#define DEBUG 0
#define MAX_CLOCK_SEC 365*24*60*60ULL
#define MAX_CLOCK_SEC64 60*60ULL
#define dprintf(...) if (DEBUG) { printf(__VA_ARGS__); }

enum {
	__CLOCK_64_BIT = 1 << 0,
	__CLOCK_128_BIT = 1 << 1,
	__CLOCK_MULT_SHIFT = 1 << 2,
	__CLOCK_EMULATE_128 = 1 << 3,
	__CLOCK_REDUCE = 1 << 4,

	CLOCK_64_MULT_SHIFT = __CLOCK_64_BIT | __CLOCK_MULT_SHIFT,
	CLOCK_64_EMULATE_128 = __CLOCK_64_BIT | __CLOCK_EMULATE_128,
	CLOCK_64_2STAGE = __CLOCK_64_BIT | __CLOCK_REDUCE,
	CLOCK_128_MULT_SHIFT = __CLOCK_128_BIT | __CLOCK_MULT_SHIFT,
};

unsigned int clock_shift;
unsigned int max_cycles_shift;
unsigned long long max_cycles_mask;
unsigned long long *nsecs;
unsigned long long clock_mult;
unsigned long long nsecs_for_max_cycles;
unsigned long long clock_mult64_128[2];
__uint128_t clock_mult128;


/*
 * Functions for carrying out 128-bit
 * arithmetic using 64-bit integers
 *
 * 128-bit integers are stored as
 * arrays of two 64-bit integers
 *
 * Ordering is little endian
 *
 * a[0] has the less significant bits
 * a[1] has the more significant bits
 */
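/*
 * For example (illustration only): the 128-bit value 2^64 + 5 would be
 * stored as a[0] = 5 (low 64 bits) and a[1] = 1 (high 64 bits).
 */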
/* Multiply 128-bit a by 64-bit b, storing the 128-bit result in product.
 * NOTE: this is currently a placeholder that always returns zero, so the
 * CLOCK_64_EMULATE_128 path does not yet produce meaningful results. */
void do_mult(unsigned long long a[2], unsigned long long b, unsigned long long product[2])
{
	product[0] = product[1] = 0;
	return;
}

/* Divide 128-bit a by 64-bit b, storing the 128-bit quotient in c.
 * NOTE: this is currently a placeholder that leaves c untouched. */
void do_div(unsigned long long a[2], unsigned long long b, unsigned long long c[2])
{
	return;
}

/* Right-shift 128-bit a by count bits, for 64 < count < 128 */
void do_shift64(unsigned long long a[2], unsigned int count)
{
	a[0] = a[1] >> (count-64);
	a[1] = 0;
}

/* Right-shift 128-bit a by count bits */
void do_shift(unsigned long long a[2], unsigned int count)
{
	if (count > 64)
		do_shift64(a, count);
	else
		while (count--) {
			a[0] >>= 1;
			a[0] |= a[1] << 63;
			a[1] >>= 1;
		}
}

/*
 * Convert clock ticks t to nanoseconds using the
 * conversion scheme selected by mode
 */
unsigned long long get_nsec(int mode, unsigned long long t)
{
	switch(mode) {
	case CLOCK_64_MULT_SHIFT: {
		return (t * clock_mult) >> clock_shift;
	}
	case CLOCK_64_EMULATE_128: {
		unsigned long long product[2];

		do_mult(clock_mult64_128, t, product);
		do_shift(product, clock_shift);
		return product[0];
	}
	case CLOCK_64_2STAGE: {
		unsigned long long multiples, nsec;

		multiples = t >> max_cycles_shift;
		dprintf("multiples=%llu\n", multiples);
		nsec = multiples * nsecs_for_max_cycles;
		nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
		return nsec;
	}
	case CLOCK_128_MULT_SHIFT: {
		return (unsigned long long)((t * clock_mult128) >> clock_shift);
	}
	default: {
		assert(0);
	}
	}
}

void calc_mult_shift(int mode, void *mult, unsigned int *shift, unsigned long long max_sec, unsigned long long cycles_per_usec)
{
	unsigned long long max_ticks;
	max_ticks = max_sec * cycles_per_usec * 1000000ULL;

	switch (mode) {
	case CLOCK_64_MULT_SHIFT: {
		unsigned long long max_mult, tmp;
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 64-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion
		 */
		max_mult = ULLONG_MAX / max_ticks;
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		tmp = max_mult * cycles_per_usec / 1000;
		while (tmp > 1) {
			tmp >>= 1;
			sft++;
			dprintf("tmp=%llu, sft=%u\n", tmp, sft);
		}

		*shift = sft;
		*((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec);
		break;
	}
	case CLOCK_64_EMULATE_128: {
		unsigned long long max_mult[2], tmp[2];
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 128-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion,
		 * but use only 64-bit integers in the process
		 */
		max_mult[0] = max_mult[1] = ULLONG_MAX;
		do_div(max_mult, max_ticks, max_mult);
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
			max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		do_div(max_mult, cycles_per_usec, tmp);
		do_div(tmp, 1000ULL, tmp);
		while (tmp[1] > 0 || tmp[0] > 1) {
			do_shift(tmp, 1);
			sft++;
			dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft);
		}

		*shift = sft;
//		*((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
		break;
	}
	case CLOCK_64_2STAGE: {
		unsigned long long tmp;
/*
 * This clock tick to nsec conversion requires two stages.
 *
 * Stage 1: Determine how many ~MAX_CLOCK_SEC64 periods worth of clock ticks
 * have elapsed and set nsecs to the appropriate value for those
 * ~MAX_CLOCK_SEC64 periods.
 * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC64 periods from
 * Stage 1. Convert remaining clock ticks to nsecs and add to previously
 * set nsec value.
 *
 * To optimize the arithmetic operations, use the greatest power of 2 ticks
 * less than the number of ticks in MAX_CLOCK_SEC64 seconds.
 *
 */
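/*
 * Illustrative numbers (hypothetical, not from the original code): with
 * cycles_per_usec = 3333, MAX_CLOCK_SEC64 (one hour) spans about 1.2e13
 * ticks, and the greatest power of 2 below that is 2^43. So
 * max_cycles_shift = 43, each block of 2^43 ticks (~44 minutes at 3.333 GHz)
 * contributes nsecs_for_max_cycles, and only the remaining
 * (ticks & max_cycles_mask) ticks go through the mult/shift conversion.
 */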
		// Use a period shorter than MAX_CLOCK_SEC here for better accuracy
		calc_mult_shift(CLOCK_64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC64, cycles_per_usec);

		// Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC64
		max_cycles_shift = max_cycles_mask = 0;
		tmp = MAX_CLOCK_SEC64 * 1000000ULL * cycles_per_usec;
		dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
		while (tmp > 1) {
			tmp >>= 1;
			max_cycles_shift++;
			dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
		}
		// if we use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will
		// have a discontinuity every (1ULL << max_cycles_shift) cycles
		nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift;

		// Use a bitmask to calculate ticks % (1ULL << max_cycles_shift)
		for (tmp = 0; tmp < max_cycles_shift; tmp++)
			max_cycles_mask |= 1ULL << tmp;

		dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n",
			max_cycles_shift, (1ULL << max_cycles_shift),
			nsecs_for_max_cycles, max_cycles_mask);

		break;
	}
	case CLOCK_128_MULT_SHIFT: {
		__uint128_t max_mult, tmp;
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 128-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion
		 */
		max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX;
		max_mult /= max_ticks;
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
			max_ticks, __builtin_clzll(max_ticks),
			(unsigned long long) (max_mult >> 64),
			(unsigned long long) max_mult);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		tmp = max_mult * cycles_per_usec / 1000;
		while (tmp > 1) {
			tmp >>= 1;
			sft++;
			dprintf("tmp=0x%016llx%016llx, sft=%u\n",
				(unsigned long long) (tmp >> 64),
				(unsigned long long) tmp, sft);
		}

		*shift = sft;
		*((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
		break;
	}
	}
}

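/*
 * For len consecutive tick values starting at start, convert each to nsec
 * with get_nsec() and check that values delta_ticks apart differ by
 * delta_nsec ns. Any deviation is counted as a discontinuity, and deviations
 * larger than 1 ns are counted as errors. Returns the number of
 * discontinuities found.
 */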
int discontinuity(int mode, int delta_ticks, int delta_nsec, unsigned long long start, unsigned long len)
{
	int i;
	unsigned long mismatches = 0, bad_mismatches = 0;
	unsigned long long delta, max_mismatch = 0;
	unsigned long long *ns = nsecs;

	for (i = 0; i < len; ns++, i++) {
		*ns = get_nsec(mode, start + i);
		if (i - delta_ticks >= 0) {
			if (*ns > *(ns - delta_ticks))
				delta = *ns - *(ns - delta_ticks);
			else
				delta = *(ns - delta_ticks) - *ns;
			if (delta > delta_nsec)
				delta -= delta_nsec;
			else
				delta = delta_nsec - delta;
			if (delta) {
				mismatches++;
				if (delta > 1)
					bad_mismatches++;
				if (delta > max_mismatch)
					max_mismatch = delta;
			}
		}
		if (!bad_mismatches)
			assert(max_mismatch == 0 || max_mismatch == 1);
		if (!mismatches)
			assert(max_mismatch == 0);
	}

	printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n",
		mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start,
		start + len - 1);
	return mismatches;
}

#define MIN_TICKS 1ULL
#define LEN 1000000000ULL
#define NSEC_ONE_SEC 1000000000ULL
#define TESTLEN 9
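/*
 * Calibrate clock_mult/clock_shift for the given mode and clock rate
 * (cycles_per_usec), then check the tick-to-nsec conversion at max_ticks
 * and at a set of spot-check tick values. When fast_test is zero, also scan
 * two ranges of LEN consecutive tick values for discontinuities. Returns
 * the conversion error in msec at max_ticks.
 */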
long long test_clock(int mode, int cycles_per_usec, int fast_test, int quiet, int delta_ticks, int delta_nsec)
{
	int i;
	long long delta;
	unsigned long long max_ticks;
	unsigned long long nsecs;
	void *mult;
	unsigned long long test_ns[TESTLEN] =
			{NSEC_ONE_SEC, NSEC_ONE_SEC,
			 NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60,
			 NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4,
			 NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24};
	unsigned long long test_ticks[TESTLEN];

	max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL;

	switch(mode) {
	case CLOCK_64_MULT_SHIFT: {
		mult = &clock_mult;
		break;
	}
	case CLOCK_64_EMULATE_128: {
		mult = clock_mult64_128;
		break;
	}
	case CLOCK_64_2STAGE: {
		mult = &clock_mult;
		break;
	}
	case CLOCK_128_MULT_SHIFT: {
		mult = &clock_mult128;
		break;
	}
	}
	calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec);
	nsecs = get_nsec(mode, max_ticks);
	delta = nsecs/1000000 - MAX_CLOCK_SEC*1000;

	if (mode == CLOCK_64_2STAGE) {
		test_ns[0] = nsecs_for_max_cycles - 1;
		test_ns[1] = nsecs_for_max_cycles;
		test_ticks[0] = (1ULL << max_cycles_shift) - 1;
		test_ticks[1] = (1ULL << max_cycles_shift);

		for (i = 2; i < TESTLEN; i++)
			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
	}
	else {
		for (i = 0; i < TESTLEN; i++)
			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
	}

	if (!quiet) {
		printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n",
			cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift));
		switch(mode) {
		case CLOCK_64_2STAGE:
		case CLOCK_64_MULT_SHIFT: {
			printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n",
				clock_mult, (double) clock_mult / (1ULL << clock_shift));
			break;
		}
		case CLOCK_64_EMULATE_128: {
			printf("clock_mult=0x%016llx%016llx\n",
				clock_mult64_128[1], clock_mult64_128[0]);
			break;
		}
		case CLOCK_128_MULT_SHIFT: {
			printf("clock_mult=0x%016llx%016llx\n",
				(unsigned long long) (clock_mult128 >> 64),
				(unsigned long long) clock_mult128);
			break;
		}
		}
		printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n",
			nsecs, MAX_CLOCK_SEC*1000000000ULL, delta);
	}

	for (i = 0; i < TESTLEN; i++)
	{
		nsecs = get_nsec(mode, test_ticks[i]);
		delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs;
		if (!quiet || delta > 0)
			printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n",
				test_ticks[i], nsecs, test_ns[i], delta);
	}

	if (!fast_test) {
		discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN);
		discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN);
	}

	if (!quiet)
		printf("\n\n");

	return delta;
}

int main(int argc, char *argv[])
{
	int i, days;
	long long error;
	long long errors[10001];
	double mean;
	/* i, error, errors, and mean are only used by the
	 * commented-out frequency sweep below */

	nsecs = malloc(LEN * sizeof(unsigned long long));
	assert(nsecs != NULL);
	days = MAX_CLOCK_SEC / 60 / 60 / 24;

	test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 0, 0);
//	test_clock(CLOCK_64_MULT_SHIFT, 3333, 1, 0, 0, 0);
//	test_clock(CLOCK_128_MULT_SHIFT, 3333, 1, 0, 0, 0);

// Test 3 different clock types from 1000 to 10000 MHz
// and calculate average error
/*
	for (i = 1000, mean = 0.0; i <= 10000; i++) {
		error = test_clock(CLOCK_64_MULT_SHIFT, i, 1, 1, 0, 0);
		errors[i] = error > 0 ? error : -1LL * error;
		mean += (double) errors[i] / 9000;
	}
	printf(" 64-bit average error per %d days: %fms\n", days, mean);

	for (i = 1000, mean = 0.0; i <= 10000; i++) {
		error = test_clock(CLOCK_64_2STAGE, i, 1, 1, 0, 0);
		errors[i] = error > 0 ? error : -1LL * error;
		mean += (double) errors[i] / 9000;
	}
	printf(" 64-bit two-stage average error per %d days: %fms\n", days, mean);

	for (i = 1000, mean = 0.0; i <= 10000; i++) {
		error = test_clock(CLOCK_128_MULT_SHIFT, i, 1, 1, 0, 0);
		errors[i] = error > 0 ? error : -1LL * error;
		mean += (double) errors[i] / 9000;
	}
	printf(" 128-bit average error per %d days: %fms\n", days, mean);
*/
	test_clock(CLOCK_64_2STAGE, 1000, 1, 0, 1, 1);
	test_clock(CLOCK_64_2STAGE, 1100, 1, 0, 11, 10);
	test_clock(CLOCK_64_2STAGE, 3000, 1, 0, 3, 1);
	test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 3333, 1000);
	test_clock(CLOCK_64_2STAGE, 3392, 1, 0, 424, 125);
	test_clock(CLOCK_64_2STAGE, 4500, 1, 0, 9, 2);
	test_clock(CLOCK_64_2STAGE, 5000, 1, 0, 5, 1);

	free(nsecs);
	return 0;
}