Fix Windows CPU count
[fio.git] / t / time-test.c
... / ...
CommitLineData
1/*
2 * Carry out arithmetic to explore conversion of CPU clock ticks to nsec
3 *
4 * When we use the CPU clock for timing, we do the following:
5 *
6 * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks
7 * to actual time.
8 *
9 * Using gettimeofday() or clock_gettime(), count how many CPU clock
10 * ticks occur per usec
11 *
12 * 2) Calculate conversion factors so that we can ultimately convert
13 * from clocks ticks to nsec with
14 * nsec = (ticks * clock_mult) >> clock_shift
15 *
16 * This is equivalent to
17 * nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER
18 * where
19 * clock_mult = MULTIPLIER / cycles_per_nsec
20 * MULTIPLIER = 2^clock_shift
21 *
22 * It would be simpler to just calculate nsec = ticks / cycles_per_nsec,
23 * but all of this is necessary because of rounding when calculating
24 * cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply
25 * be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional
26 * portion is lost with integer arithmetic.
27 *
28 * This multiply and shift calculation also has a performance benefit
29 * as multiplication and bit shift operations are faster than integer
30 * division.
31 *
32 * 3) Dynamically determine clock_shift and clock_mult at run time based
33 * on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum
34 * duration for which the conversion will be valid.
35 *
36 * The primary constraint is that (ticks * clock_mult) must not overflow
37 * when ticks is at its maximum value.
38 *
39 * So we have
40 * max_ticks = MAX_CLOCK_SEC * 1000000000 * cycles_per_nsec
41 * max_ticks * clock_mult <= ULLONG_MAX
42 * max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX
43 * MULTIPLIER <= ULLONG_MAX * cycles_per_nsec / max_ticks
44 *
45 * Then choose the largest clock_shift that satisfies
46 * 2^clock_shift <= ULLONG_MAX * cycles_per_nsec / max_ticks
47 *
48 * Finally calculate the appropriate clock_mult associated with clock_shift
49 * clock_mult = 2^clock_shift / cycles_per_nsec
50 *
51 * 4) In the code below we have cycles_per_usec and use
52 * cycles_per_nsec = cycles_per_usec / 1000
53 *
54 *
55 * The code below implements 4 clock tick to nsec conversion strategies
56 *
57 * i) 64-bit arithmetic for the (ticks * clock_mult) product with the
58 * conversion valid for at most MAX_CLOCK_SEC
59 *
60 * ii) NOT IMPLEMENTED Use 64-bit integers to emulate 128-bit multiplication
61 * for the (ticks * clock_mult) product
62 *
63 * iii) 64-bit arithmetic with clock ticks to nsec conversion occurring in
64 * two stages. The first stage counts the number of discrete, large chunks
65 * of time that have elapsed. To this is added the time represented by
 * the remaining clock ticks. The advantage of this strategy is better
 * accuracy because the (ticks * clock_mult) product used for the final
 * fractional chunk involves smaller tick values, reducing rounding error
69 *
 * iv) 64-bit arithmetic with the clock ticks to nsec conversion occurring in
71 * two stages. This is carried out using locks to update the number of
72 * large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed.
73 *
74 * v) 128-bit arithmetic used for the clock ticks to nsec conversion.
75 *
76 */
77
78#include <stdio.h>
79#include <stdlib.h>
80#include <limits.h>
81#include <assert.h>
82#include <stdlib.h>
83#include "lib/seqlock.h"
84
#define DEBUG 0
/* Maximum duration (one year, in seconds) for which a conversion is valid.
 * Parenthesized so the macro is safe in any expression context. */
#define MAX_CLOCK_SEC (365 * 24 * 60 * 60ULL)
/* Large time chunk (one hour, in seconds) used by the two-stage strategies */
#define MAX_CLOCK_SEC_2STAGE (60 * 60ULL)
/* Debug printf; do/while(0) keeps it safe inside un-braced if/else bodies */
#define dprintf(...) do { if (DEBUG) printf(__VA_ARGS__); } while (0)
89
/*
 * Conversion strategy selectors. The double-underscore bits describe
 * individual properties; the named modes below combine them and are the
 * values actually passed around as "mode".
 */
enum {
	__CLOCK64_BIT = 1 << 0,		/* uses 64-bit integer arithmetic */
	__CLOCK128_BIT = 1 << 1,	/* uses native __uint128_t arithmetic */
	__CLOCK_MULT_SHIFT = 1 << 2,	/* single multiply-and-shift conversion */
	__CLOCK_EMULATE_128 = 1 << 3,	/* 128-bit math emulated with 64-bit ints */
	__CLOCK_2STAGE = 1 << 4,	/* two-stage chunked conversion */
	__CLOCK_LOCK = 1 << 5,		/* seqlock-protected two-stage conversion */

	CLOCK64_MULT_SHIFT = __CLOCK64_BIT | __CLOCK_MULT_SHIFT,
	CLOCK64_EMULATE_128 = __CLOCK64_BIT | __CLOCK_EMULATE_128,
	CLOCK64_2STAGE = __CLOCK64_BIT | __CLOCK_2STAGE,
	CLOCK64_LOCK = __CLOCK64_BIT | __CLOCK_LOCK,
	CLOCK128_MULT_SHIFT = __CLOCK128_BIT | __CLOCK_MULT_SHIFT,
};
104
/* Protects elapsed_nsec/cycles_start for the CLOCK64_LOCK strategy */
static struct seqlock clock_seqlock;
/* Tick count at the start of the most recently completed large time chunk */
static unsigned long long cycles_start;
/* Nanoseconds accounted for by fully elapsed large time chunks */
static unsigned long long elapsed_nsec;

/* log2 of the tick count in one large time chunk (two-stage strategies) */
static unsigned int max_cycles_shift;
/* (1 << max_cycles_shift) - 1: mask extracting ticks within the current chunk */
static unsigned long long max_cycles_mask;
/* Nanoseconds represented by one full (1 << max_cycles_shift) tick chunk */
static unsigned long long nsecs_for_max_cycles;

/* Conversion factors: nsec = (ticks * clock_mult) >> clock_shift */
static unsigned int clock_shift;
static unsigned long long clock_mult;

/* Scratch buffer of per-tick results used by discontinuity() */
static unsigned long long *nsecs;
/* 128-bit multiplier stored as two 64-bit words (little endian) */
static unsigned long long clock_mult64_128[2];
/* Native 128-bit multiplier for CLOCK128_MULT_SHIFT */
static __uint128_t clock_mult128;
119
120/*
121 * Functions for carrying out 128-bit
122 * arithmetic using 64-bit integers
123 *
124 * 128-bit integers are stored as
125 * arrays of two 64-bit integers
126 *
127 * Ordering is little endian
128 *
129 * a[0] has the less significant bits
130 * a[1] has the more significant bits
131 *
132 * NOT FULLY IMPLEMENTED
133 */
/*
 * 128-bit x 64-bit multiply using only 64-bit integers.
 *
 * a:       128-bit multiplicand as two 64-bit words, a[0] least significant
 * b:       64-bit multiplier
 * product: receives the low 128 bits of a * b (wraps mod 2^128, matching
 *          what a native 128-bit multiply would produce)
 *
 * The previous version was a stub that always produced zero. a[0] * b is
 * computed from four 32-bit partial products with explicit carry handling;
 * a[1] * b can only affect the high word.
 */
static void do_mult(unsigned long long a[2], unsigned long long b,
		    unsigned long long product[2])
{
	unsigned long long a_lo = a[0] & 0xffffffffULL;
	unsigned long long a_hi = a[0] >> 32;
	unsigned long long b_lo = b & 0xffffffffULL;
	unsigned long long b_hi = b >> 32;
	unsigned long long p0, p1, p2, p3, carry;

	p0 = a_lo * b_lo;
	p1 = a_lo * b_hi;
	p2 = a_hi * b_lo;
	p3 = a_hi * b_hi;

	/* Sum the middle column; carry cannot overflow 64 bits here */
	carry = (p0 >> 32) + (p1 & 0xffffffffULL) + (p2 & 0xffffffffULL);
	product[0] = (p0 & 0xffffffffULL) | ((carry & 0xffffffffULL) << 32);
	product[1] = (carry >> 32) + (p1 >> 32) + (p2 >> 32) + p3;
	/* High-word contribution of a[1]; wraps mod 2^64 like real 128-bit math */
	product[1] += a[1] * b;
}
140
/*
 * 128-bit / 64-bit unsigned division using only 64-bit integers.
 *
 * a: 128-bit dividend as two 64-bit words, a[0] least significant
 * b: 64-bit divisor
 * c: receives the 128-bit quotient; c may alias a (the quotient is
 *    accumulated locally and stored only at the end)
 *
 * The previous version was a stub that left c untouched. Division by zero
 * yields a zero quotient rather than trapping.
 *
 * Classic binary long division: bring down one dividend bit per iteration.
 * The running remainder can exceed 64 bits by one bit after the shift, so
 * the bit shifted out ("ovf") is tracked separately; when set, the true
 * remainder is >= 2^64 > b and the wrapped subtraction rem - b still
 * produces the correct in-range result.
 */
static void do_div(unsigned long long a[2], unsigned long long b,
		   unsigned long long c[2])
{
	unsigned long long quot[2] = { 0, 0 };
	unsigned long long rem = 0;
	int i;

	if (b == 0) {
		c[0] = c[1] = 0;
		return;
	}

	for (i = 127; i >= 0; i--) {
		unsigned long long ovf = rem >> 63;

		rem = (rem << 1) | ((a[i / 64] >> (i % 64)) & 1);
		if (ovf || rem >= b) {
			rem -= b;	/* wraps correctly when ovf is set */
			quot[i / 64] |= 1ULL << (i % 64);
		}
	}

	c[0] = quot[0];
	c[1] = quot[1];
}
146
/*
 * Logical right shift of a 128-bit value (two 64-bit words, a[0] least
 * significant) by count >= 64 bits: only bits from the high word survive.
 * Shifting a 64-bit value by >= 64 is undefined behavior, so count >= 128
 * is handled explicitly as an all-zero result.
 */
static void do_shift64(unsigned long long a[2], unsigned int count)
{
	if (count >= 128)
		a[0] = 0;
	else
		a[0] = a[1] >> (count - 64);
	a[1] = 0;
}

/*
 * Logical right shift of a 128-bit value by an arbitrary count.
 * Replaces the original one-bit-at-a-time loop with a constant-time
 * word shift; results are identical for every count (count == 64 goes
 * through do_shift64, where the inner shift amount is 0).
 */
static void do_shift(unsigned long long a[2], unsigned int count)
{
	if (count >= 64) {
		do_shift64(a, count);
	} else if (count) {
		a[0] = (a[0] >> count) | (a[1] << (64 - count));
		a[1] >>= count;
	}
}
165
/*
 * Advance the CLOCK64_LOCK bookkeeping for tick count t: record the
 * nanoseconds covered by all fully elapsed large time chunks and the tick
 * count at the start of the current chunk. The seqlock write section makes
 * the pair (elapsed_nsec, cycles_start) appear atomic to readers in
 * _get_nsec(); the two stores must stay inside the begin/end bracket.
 */
static void update_clock(unsigned long long t)
{
	write_seqlock_begin(&clock_seqlock);
	elapsed_nsec = (t >> max_cycles_shift) * nsecs_for_max_cycles;
	cycles_start = t & ~max_cycles_mask;
	write_seqlock_end(&clock_seqlock);
}
173
174static unsigned long long _get_nsec(int mode, unsigned long long t)
175{
176 switch(mode) {
177 case CLOCK64_MULT_SHIFT:
178 return (t * clock_mult) >> clock_shift;
179 case CLOCK64_EMULATE_128: {
180 unsigned long long product[2] = { };
181
182 do_mult(clock_mult64_128, t, product);
183 do_shift(product, clock_shift);
184 return product[0];
185 }
186 case CLOCK64_2STAGE: {
187 unsigned long long multiples, nsec;
188
189 multiples = t >> max_cycles_shift;
190 dprintf("multiples=%llu\n", multiples);
191 nsec = multiples * nsecs_for_max_cycles;
192 nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
193 return nsec;
194 }
195 case CLOCK64_LOCK: {
196 unsigned int seq;
197 unsigned long long nsec;
198
199 do {
200 seq = read_seqlock_begin(&clock_seqlock);
201 nsec = elapsed_nsec;
202 nsec += ((t - cycles_start) * clock_mult) >> clock_shift;
203 } while (read_seqlock_retry(&clock_seqlock, seq));
204 return nsec;
205 }
206 case CLOCK128_MULT_SHIFT:
207 return (unsigned long long)((t * clock_mult128) >> clock_shift);
208 default:
209 assert(0);
210 }
211}
212
213static unsigned long long get_nsec(int mode, unsigned long long t)
214{
215 if (mode == CLOCK64_LOCK) {
216 update_clock(t);
217 }
218
219 return _get_nsec(mode, t);
220}
221
/*
 * Compute the conversion factors (clock_mult, clock_shift) for the given
 * strategy so that nsec = (ticks * mult) >> shift holds without overflow
 * for up to max_sec seconds of ticks.
 *
 * mode:           one of the CLOCK* strategy selectors
 * mult:           output multiplier; passed as void * because its concrete
 *                 type depends on mode (unsigned long long, a 2-word array,
 *                 or __uint128_t)
 * shift:          output shift count
 * max_sec:        maximum duration the conversion must remain valid for
 * cycles_per_usec: calibrated CPU clock frequency
 *
 * The two-stage modes also populate the file-scope globals
 * max_cycles_shift, max_cycles_mask and nsecs_for_max_cycles, and the
 * CLOCK64_2STAGE / CLOCK64_LOCK cases recurse into simpler modes.
 */
static void calc_mult_shift(int mode, void *mult, unsigned int *shift,
			    unsigned long long max_sec,
			    unsigned long long cycles_per_usec)
{
	unsigned long long max_ticks;
	max_ticks = max_sec * cycles_per_usec * 1000000ULL;

	switch (mode) {
	case CLOCK64_MULT_SHIFT: {
		unsigned long long max_mult, tmp;
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 64-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion
		 */
		max_mult = ULLONG_MAX / max_ticks;
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		tmp = max_mult * cycles_per_usec / 1000;
		while (tmp > 1) {
			tmp >>= 1;
			sft++;
			dprintf("tmp=%llu, sft=%u\n", tmp, sft);
		}

		*shift = sft;
		/* mult = 2^shift / cycles_per_nsec, with cycles_per_nsec = cycles_per_usec / 1000 */
		*((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec);
		break;
	}
	case CLOCK64_EMULATE_128: {
		unsigned long long max_mult[2], tmp[2] = { };
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 128-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion,
		 * but use only 64-bit integers in the process
		 */
		max_mult[0] = max_mult[1] = ULLONG_MAX;
		do_div(max_mult, max_ticks, max_mult);
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
			max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 *
		 * NOTE(review): the loop condition below stops when
		 * tmp == {0 or 1, 1}, i.e. it checks the words
		 * independently rather than the full 128-bit value —
		 * verify once this path is fully implemented.
		 */
		do_div(max_mult, cycles_per_usec, tmp);
		do_div(tmp, 1000ULL, tmp);
		while (tmp[0] > 1 || tmp[1] > 1) {
			do_shift(tmp, 1);
			sft++;
			dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft);
		}

		*shift = sft;
		/* The multiplier store is not implemented for this mode yet */
//		*((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
		break;
	}
	case CLOCK64_2STAGE: {
		unsigned long long tmp;
/*
 * This clock tick to nsec conversion requires two stages.
 *
 * Stage 1: Determine how many ~MAX_CLOCK_SEC_2STAGE periods worth of clock ticks
 *	have elapsed and set nsecs to the appropriate value for those
 *	~MAX_CLOCK_SEC_2STAGE periods.
 * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC_2STAGE periods from
 *	Stage 1. Convert remaining clock ticks to nsecs and add to previously
 *	set nsec value.
 *
 * To optimize the arithmetic operations, use the greatest power of 2 ticks
 * less than the number of ticks in MAX_CLOCK_SEC_2STAGE seconds.
 *
 */
		// Use a period shorter than MAX_CLOCK_SEC here for better accuracy
		calc_mult_shift(CLOCK64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC_2STAGE, cycles_per_usec);

		// Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC_2STAGE
		max_cycles_shift = max_cycles_mask = 0;
		tmp = MAX_CLOCK_SEC_2STAGE * 1000000ULL * cycles_per_usec;
		dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
		while (tmp > 1) {
			tmp >>= 1;
			max_cycles_shift++;
			dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
		}
		// if we used (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we would
		// have a discontinuity every (1ULL << max_cycles_shift) cycles
		nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift;

		// Use a bitmask to calculate ticks % (1ULL << max_cycles_shift)
		for (tmp = 0; tmp < max_cycles_shift; tmp++)
			max_cycles_mask |= 1ULL << tmp;

		dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n",
			max_cycles_shift, (1ULL << max_cycles_shift),
			nsecs_for_max_cycles, max_cycles_mask);


		break;
	}
	case CLOCK64_LOCK: {
/*
 * This clock tick to nsec conversion also requires two stages.
 *
 * Stage 1: Add to nsec the current running total of elapsed long periods
 * Stage 2: Subtract from clock ticks the tick count corresponding to the
 *	most recently elapsed long period. Convert the remaining ticks to
 *	nsec and add to the previous nsec value.
 *
 * In practice the elapsed nsec from Stage 1 and the tick count subtracted
 * in Stage 2 will be maintained in a separate thread.
 *
 */
		calc_mult_shift(CLOCK64_2STAGE, mult, shift, MAX_CLOCK_SEC, cycles_per_usec);
		cycles_start = 0;
		break;
	}
	case CLOCK128_MULT_SHIFT: {
		__uint128_t max_mult, tmp;
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 128-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion
		 */
		max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX;
		max_mult /= max_ticks;
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
			max_ticks, __builtin_clzll(max_ticks),
			(unsigned long long) (max_mult >> 64),
			(unsigned long long) max_mult);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		tmp = max_mult * cycles_per_usec / 1000;
		while (tmp > 1) {
			tmp >>= 1;
			sft++;
			dprintf("tmp=0x%016llx%016llx, sft=%u\n",
				(unsigned long long) (tmp >> 64),
				(unsigned long long) tmp, sft);
		}

		*shift = sft;
		*((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
		break;
	}
	}
}
383
384static int discontinuity(int mode, int delta_ticks, int delta_nsec,
385 unsigned long long start, unsigned long len)
386{
387 int i;
388 unsigned long mismatches = 0, bad_mismatches = 0;
389 unsigned long long delta, max_mismatch = 0;
390 unsigned long long *ns = nsecs;
391
392 for (i = 0; i < len; ns++, i++) {
393 *ns = get_nsec(mode, start + i);
394 if (i - delta_ticks >= 0) {
395 if (*ns > *(ns - delta_ticks))
396 delta = *ns - *(ns - delta_ticks);
397 else
398 delta = *(ns - delta_ticks) - *ns;
399 if (delta > delta_nsec)
400 delta -= delta_nsec;
401 else
402 delta = delta_nsec - delta;
403 if (delta) {
404 mismatches++;
405 if (delta > 1)
406 bad_mismatches++;
407 if (delta > max_mismatch)
408 max_mismatch = delta;
409 }
410 }
411 if (!bad_mismatches)
412 assert(max_mismatch == 0 || max_mismatch == 1);
413 if (!mismatches)
414 assert(max_mismatch == 0);
415 }
416
417 printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n",
418 mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start,
419 start + len - 1);
420 return mismatches;
421}
422
#define MIN_TICKS 1ULL		/* smallest tick value scanned by discontinuity() */
#define LEN 1000000000ULL	/* number of consecutive ticks per discontinuity scan */
#define NSEC_ONE_SEC 1000000000ULL
#define TESTLEN 9		/* entries in the test_ns/test_ticks tables */
427
/*
 * Exercise one conversion strategy at the given calibrated frequency.
 *
 * mode:           CLOCK* strategy selector
 * cycles_per_usec: simulated CPU frequency in cycles per usec
 * fast_test:      when non-zero, skip the billion-tick discontinuity scans
 * quiet:          when non-zero, suppress informational output
 * delta_ticks/delta_nsec: expected tick/nsec step passed to discontinuity()
 *
 * Sets up the conversion factors via calc_mult_shift(), converts a table of
 * known durations and reports the error, then (optionally) scans for
 * discontinuities. Returns the conversion error at max_ticks in ms.
 */
static long long test_clock(int mode, int cycles_per_usec, int fast_test,
			    int quiet, int delta_ticks, int delta_nsec)
{
	int i;
	long long delta;
	unsigned long long max_ticks;
	unsigned long long nsecs;	/* NOTE(review): shadows the file-scope nsecs buffer */
	void *mult;
	/* The first two entries are placeholders; they are overwritten for
	 * CLOCK64_2STAGE to probe the chunk-boundary ticks */
	unsigned long long test_ns[TESTLEN] =
			{NSEC_ONE_SEC, NSEC_ONE_SEC,
			NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60,
			NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4,
			NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24};
	unsigned long long test_ticks[TESTLEN];

	max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL;

	/* Each mode stores its multiplier in a differently-typed global */
	switch(mode) {
	case CLOCK64_MULT_SHIFT:
		mult = &clock_mult;
		break;
	case CLOCK64_EMULATE_128:
		mult = clock_mult64_128;
		break;
	case CLOCK64_2STAGE:
		mult = &clock_mult;
		break;
	case CLOCK64_LOCK:
		mult = &clock_mult;
		break;
	case CLOCK128_MULT_SHIFT:
		mult = &clock_mult128;
		break;
	default:
		/* NOTE(review): if NDEBUG disables assert(), mult is used
		 * uninitialized below — unreachable for the current callers */
		assert(0);
	}
	calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec);
	nsecs = get_nsec(mode, max_ticks);
	/* Error in ms at the far end of the valid range */
	delta = nsecs/1000000 - MAX_CLOCK_SEC*1000;

	if (mode == CLOCK64_2STAGE) {
		/* Probe the ticks immediately around a large-chunk boundary */
		test_ns[0] = nsecs_for_max_cycles - 1;
		test_ns[1] = nsecs_for_max_cycles;
		test_ticks[0] = (1ULL << max_cycles_shift) - 1;
		test_ticks[1] = (1ULL << max_cycles_shift);

		for (i = 2; i < TESTLEN; i++)
			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
	}
	else {
		for (i = 0; i < TESTLEN; i++)
			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
	}

	if (!quiet) {
		printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n",
		       cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift));
		switch(mode) {
		case CLOCK64_LOCK:
		case CLOCK64_2STAGE:
		case CLOCK64_MULT_SHIFT: {
			printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n",
			       clock_mult, (double) clock_mult / (1ULL << clock_shift));
			break;
		}
		case CLOCK64_EMULATE_128: {
			printf("clock_mult=0x%016llx%016llx\n",
			       clock_mult64_128[1], clock_mult64_128[0]);
			break;
		}
		case CLOCK128_MULT_SHIFT: {
			printf("clock_mult=0x%016llx%016llx\n",
			       (unsigned long long) (clock_mult128 >> 64),
			       (unsigned long long) clock_mult128);
			break;
		}
		}
		printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n",
		       nsecs, MAX_CLOCK_SEC*1000000000ULL, delta);
	}

	for (i = 0; i < TESTLEN; i++)
	{
		nsecs = get_nsec(mode, test_ticks[i]);
		delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs;
		/* NOTE(review): delta is long long but printed with %llu;
		 * it is non-negative here so the value is unaffected */
		if (!quiet || delta > 0)
			printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n",
			       test_ticks[i], nsecs, test_ns[i], delta);
	}

	if (!fast_test) {
		/* Scan both ends of the valid tick range */
		discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN);
		discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN);
	}

	if (!quiet)
		printf("\n\n");

	return delta;
}
528
529int main(int argc, char *argv[])
530{
531 nsecs = malloc(LEN * sizeof(unsigned long long));
532
533 test_clock(CLOCK64_LOCK, 3333, 1, 0, 0, 0);
534 test_clock(CLOCK64_LOCK, 1000, 1, 0, 1, 1);
535 test_clock(CLOCK64_LOCK, 1100, 1, 0, 11, 10);
536 test_clock(CLOCK64_LOCK, 3000, 1, 0, 3, 1);
537 test_clock(CLOCK64_LOCK, 3333, 1, 0, 3333, 1000);
538 test_clock(CLOCK64_LOCK, 3392, 1, 0, 424, 125);
539 test_clock(CLOCK64_LOCK, 4500, 1, 0, 9, 2);
540 test_clock(CLOCK64_LOCK, 5000, 1, 0, 5, 1);
541
542 free(nsecs);
543 return 0;
544}