Commit | Line | Data |
---|---|---|
dcbc7841 VF |
1 | /* |
2 | * Carry out arithmetic to explore conversion of CPU clock ticks to nsec | |
3 | * | |
4 | * When we use the CPU clock for timing, we do the following: | |
5 | * | |
6 | * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks | |
7 | * to actual time. | |
8 | * | |
9 | * Using gettimeofday() or clock_gettime(), count how many CPU clock | |
10 | * ticks occur per usec | |
11 | * | |
12 | * 2) Calculate conversion factors so that we can ultimately convert | |
13 | * from clocks ticks to nsec with | |
14 | * nsec = (ticks * clock_mult) >> clock_shift | |
15 | * | |
16 | * This is equivalent to | |
17 | * nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER | |
18 | * where | |
19 | * clock_mult = MULTIPLIER / cycles_per_nsec | |
20 | * MULTIPLIER = 2^clock_shift | |
21 | * | |
22 | * It would be simpler to just calculate nsec = ticks / cycles_per_nsec, | |
23 | * but all of this is necessary because of rounding when calculating | |
24 | * cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply | |
25 | * be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional | |
26 | * portion is lost with integer arithmetic. | |
27 | * | |
28 | * This multiply and shift calculation also has a performance benefit | |
29 | * as multiplication and bit shift operations are faster than integer | |
30 | * division. | |
31 | * | |
32 | * 3) Dynamically determine clock_shift and clock_mult at run time based | |
33 | * on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum | |
34 | * duration for which the conversion will be valid. | |
35 | * | |
36 | * The primary constraint is that (ticks * clock_mult) must not overflow | |
37 | * when ticks is at its maximum value. | |
38 | * | |
39 | * So we have | |
40 | * max_ticks * clock_mult <= ULLONG_MAX | |
41 | * max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX | |
42 | * MULTIPLIER <= ULLONG_MAX / max_ticks * cycles_per_nsec | |
43 | * | |
44 | * Then choose the largest clock_shift that satisfies | |
45 | * 2^clock_shift <= ULLONG_MAX / max_ticks * cycles_per_nsec | |
46 | * | |
47 | * Finally calculate the appropriate clock_mult associated with clock_shift | |
48 | * clock_mult = 2^clock_shift / cycles_per_nsec | |
49 | * | |
50 | * 4) In the code below we have cycles_per_usec and use | |
51 | * cycles_per_nsec = cycles_per_usec / 1000 | |
52 | * | |
53 | */ | |
54 | ||
55 | #include <stdio.h> | |
56 | #include <stdlib.h> | |
57 | #include <limits.h> | |
58 | #include <assert.h> | |
59 | #include <stdlib.h> | |
60 | ||
61 | #define DEBUG 0 | |
62 | #define MAX_CLOCK_SEC 365*24*60*60ULL | |
63 | #define MAX_CLOCK_SEC64 60*60ULL | |
64 | #define dprintf(...) if (DEBUG) { printf(__VA_ARGS__); } | |
65 | ||
66 | enum { | |
67 | __CLOCK_64_BIT = 1 << 0, | |
68 | __CLOCK_128_BIT = 1 << 1, | |
69 | __CLOCK_MULT_SHIFT = 1 << 2, | |
70 | __CLOCK_EMULATE_128 = 1 << 3, | |
71 | __CLOCK_REDUCE = 1 << 4, | |
72 | ||
73 | CLOCK_64_MULT_SHIFT = __CLOCK_64_BIT | __CLOCK_MULT_SHIFT, | |
74 | CLOCK_64_EMULATE_128 = __CLOCK_64_BIT | __CLOCK_EMULATE_128, | |
75 | CLOCK_64_2STAGE = __CLOCK_64_BIT | __CLOCK_REDUCE, | |
76 | CLOCK_128_MULT_SHIFT = __CLOCK_128_BIT | __CLOCK_MULT_SHIFT, | |
77 | }; | |
78 | ||
79 | unsigned int clock_shift; | |
80 | unsigned int max_cycles_shift; | |
81 | unsigned long long max_cycles_mask; | |
82 | unsigned long long *nsecs; | |
83 | unsigned long long clock_mult; | |
84 | unsigned long long nsecs_for_max_cycles; | |
85 | unsigned long long clock_mult64_128[2]; | |
86 | __uint128_t clock_mult128; | |
87 | ||
88 | ||
89 | /* | |
90 | * Functions for carrying out 128-bit | |
91 | * arithmetic using 64-bit integers | |
92 | * | |
93 | * 128-bit integers are stored as | |
94 | * arrays of two 64-bit integers | |
95 | * | |
96 | * Ordering is little endian | |
97 | * | |
98 | * a[0] has the less significant bits | |
99 | * a[1] has the more significant bits | |
100 | */ | |
101 | void do_mult(unsigned long long a[2], unsigned long long b, unsigned long long product[2]) | |
102 | { | |
103 | product[0] = product[1] = 0; | |
104 | return; | |
105 | } | |
106 | ||
107 | void do_div(unsigned long long a[2], unsigned long long b, unsigned long long c[2]) | |
108 | { | |
109 | return; | |
110 | } | |
111 | ||
112 | void do_shift64(unsigned long long a[2], unsigned int count) | |
113 | { | |
114 | a[0] = a[1] >> (count-64); | |
115 | a[1] = 0; | |
116 | } | |
117 | ||
118 | void do_shift(unsigned long long a[2], unsigned int count) | |
119 | { | |
120 | if (count > 64) | |
121 | do_shift64(a, count); | |
122 | else | |
123 | while (count--) { | |
124 | a[0] >>= 1; | |
125 | a[0] |= a[1] << 63; | |
126 | a[1] >>= 1; | |
127 | } | |
128 | } | |
129 | ||
130 | unsigned long long get_nsec(int mode, unsigned long long t) | |
131 | { | |
132 | switch(mode) { | |
133 | case CLOCK_64_MULT_SHIFT: { | |
134 | return (t * clock_mult) >> clock_shift; | |
135 | } | |
136 | case CLOCK_64_EMULATE_128: { | |
137 | unsigned long long product[2]; | |
138 | do_mult(clock_mult64_128, t, product); | |
139 | do_shift(product, clock_shift); | |
140 | return product[0]; | |
141 | } | |
142 | case CLOCK_64_2STAGE: { | |
143 | unsigned long long multiples, nsec; | |
144 | multiples = t >> max_cycles_shift; | |
145 | dprintf("multiples=%llu\n", multiples); | |
146 | nsec = multiples * nsecs_for_max_cycles; | |
147 | nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift; | |
148 | return nsec; | |
149 | } | |
150 | case CLOCK_128_MULT_SHIFT: { | |
151 | return (unsigned long long)((t * clock_mult128) >> clock_shift); | |
152 | } | |
153 | default: { | |
154 | assert(0); | |
155 | } | |
156 | } | |
157 | } | |
158 | ||
159 | void calc_mult_shift(int mode, void *mult, unsigned int *shift, unsigned long long max_sec, unsigned long long cycles_per_usec) | |
160 | { | |
161 | unsigned long long max_ticks; | |
162 | max_ticks = max_sec * cycles_per_usec * 1000000ULL; | |
163 | ||
164 | switch (mode) { | |
165 | case CLOCK_64_MULT_SHIFT: { | |
166 | unsigned long long max_mult, tmp; | |
167 | unsigned int sft = 0; | |
168 | ||
169 | /* | |
170 | * Calculate the largest multiplier that will not | |
171 | * produce a 64-bit overflow in the multiplication | |
172 | * step of the clock ticks to nsec conversion | |
173 | */ | |
174 | max_mult = ULLONG_MAX / max_ticks; | |
175 | dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult); | |
176 | ||
177 | /* | |
178 | * Find the largest shift count that will produce | |
179 | * a multiplier less than max_mult | |
180 | */ | |
181 | tmp = max_mult * cycles_per_usec / 1000; | |
182 | while (tmp > 1) { | |
183 | tmp >>= 1; | |
184 | sft++; | |
185 | dprintf("tmp=%llu, sft=%u\n", tmp, sft); | |
186 | } | |
187 | ||
188 | *shift = sft; | |
189 | *((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec); | |
190 | break; | |
191 | } | |
192 | case CLOCK_64_EMULATE_128: { | |
193 | unsigned long long max_mult[2], tmp[2]; | |
194 | unsigned int sft = 0; | |
195 | ||
196 | /* | |
197 | * Calculate the largest multiplier that will not | |
198 | * produce a 128-bit overflow in the multiplication | |
199 | * step of the clock ticks to nsec conversion, | |
200 | * but use only 64-bit integers in the process | |
201 | */ | |
202 | max_mult[0] = max_mult[1] = ULLONG_MAX; | |
203 | do_div(max_mult, max_ticks, max_mult); | |
204 | dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", | |
205 | max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]); | |
206 | ||
207 | /* | |
208 | * Find the largest shift count that will produce | |
209 | * a multiplier less than max_mult | |
210 | */ | |
211 | do_div(max_mult, cycles_per_usec, tmp); | |
212 | do_div(tmp, 1000ULL, tmp); | |
213 | while (tmp[0] > 1 || tmp[1] > 1) { | |
214 | do_shift(tmp, 1); | |
215 | sft++; | |
216 | dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft); | |
217 | } | |
218 | ||
219 | *shift = sft; | |
220 | // *((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); | |
221 | break; | |
222 | } | |
223 | case CLOCK_64_2STAGE: { | |
224 | unsigned long long tmp; | |
225 | /* | |
226 | * This clock tick to nsec conversion requires two stages. | |
227 | * | |
228 | * Stage 1: Determine how many ~MAX_CLOCK_SEC64 periods worth of clock ticks | |
229 | * have elapsed and set nsecs to the appropriate value for those | |
230 | * ~MAX_CLOCK_SEC64 periods. | |
231 | * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC64 periods from | |
232 | * Stage 1. Convert remaining clock ticks to nsecs and add to previously | |
233 | * set nsec value. | |
234 | * | |
235 | * To optimize the arithmetic operations, use the greatest power of 2 ticks | |
236 | * less than the number of ticks in MAX_CLOCK_SEC64 seconds. | |
237 | * | |
238 | */ | |
239 | // Use a period shorter than MAX_CLOCK_SEC here for better accuracy | |
240 | calc_mult_shift(CLOCK_64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC64, cycles_per_usec); | |
241 | ||
242 | // Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC64 | |
243 | max_cycles_shift = max_cycles_mask = 0; | |
244 | tmp = MAX_CLOCK_SEC64 * 1000000ULL * cycles_per_usec; | |
245 | dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); | |
246 | while (tmp > 1) { | |
247 | tmp >>= 1; | |
248 | max_cycles_shift++; | |
249 | dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); | |
250 | } | |
251 | // if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will | |
252 | // have a discontinuity every (1ULL << max_cycles_shift) cycles | |
253 | nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift; | |
254 | ||
255 | // Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) | |
256 | for (tmp = 0; tmp < max_cycles_shift; tmp++) | |
257 | max_cycles_mask |= 1ULL << tmp; | |
258 | ||
259 | dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n", | |
260 | max_cycles_shift, (1ULL << max_cycles_shift), | |
261 | nsecs_for_max_cycles, max_cycles_mask); | |
262 | ||
263 | ||
264 | break; | |
265 | } | |
266 | case CLOCK_128_MULT_SHIFT: { | |
267 | __uint128_t max_mult, tmp; | |
268 | unsigned int sft = 0; | |
269 | ||
270 | /* | |
271 | * Calculate the largest multiplier that will not | |
272 | * produce a 128-bit overflow in the multiplication | |
273 | * step of the clock ticks to nsec conversion | |
274 | */ | |
275 | max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX; | |
276 | max_mult /= max_ticks; | |
277 | dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", | |
278 | max_ticks, __builtin_clzll(max_ticks), | |
279 | (unsigned long long) (max_mult >> 64), | |
280 | (unsigned long long) max_mult); | |
281 | ||
282 | /* | |
283 | * Find the largest shift count that will produce | |
284 | * a multiplier less than max_mult | |
285 | */ | |
286 | tmp = max_mult * cycles_per_usec / 1000; | |
287 | while (tmp > 1) { | |
288 | tmp >>= 1; | |
289 | sft++; | |
290 | dprintf("tmp=0x%016llx%016llx, sft=%u\n", | |
291 | (unsigned long long) (tmp >> 64), | |
292 | (unsigned long long) tmp, sft); | |
293 | } | |
294 | ||
295 | *shift = sft; | |
296 | *((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); | |
297 | break; | |
298 | } | |
299 | } | |
300 | } | |
301 | ||
302 | int discontinuity(int mode, int delta_ticks, int delta_nsec, unsigned long long start, unsigned long len) | |
303 | { | |
304 | int i; | |
305 | unsigned long mismatches = 0, bad_mismatches = 0; | |
306 | unsigned long long delta, max_mismatch = 0; | |
307 | unsigned long long *ns = nsecs; | |
308 | ||
309 | for (i = 0; i < len; ns++, i++) { | |
310 | *ns = get_nsec(mode, start + i); | |
311 | if (i - delta_ticks >= 0) { | |
312 | if (*ns > *(ns - delta_ticks)) | |
313 | delta = *ns - *(ns - delta_ticks); | |
314 | else | |
315 | delta = *(ns - delta_ticks) - *ns; | |
316 | if (delta > delta_nsec) | |
317 | delta -= delta_nsec; | |
318 | else | |
319 | delta = delta_nsec - delta; | |
320 | if (delta) { | |
321 | mismatches++; | |
322 | if (delta > 1) | |
323 | bad_mismatches++; | |
324 | if (delta > max_mismatch) | |
325 | max_mismatch = delta; | |
326 | } | |
327 | } | |
328 | if (!bad_mismatches) | |
329 | assert(max_mismatch == 0 || max_mismatch == 1); | |
330 | if (!mismatches) | |
331 | assert(max_mismatch == 0); | |
332 | } | |
333 | ||
334 | printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n", | |
335 | mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start, | |
336 | start + len - 1); | |
337 | return mismatches; | |
338 | } | |
339 | ||
340 | #define MIN_TICKS 1ULL | |
341 | #define LEN 1000000000ULL | |
342 | #define NSEC_ONE_SEC 1000000000ULL | |
343 | #define TESTLEN 9 | |
344 | long long test_clock(int mode, int cycles_per_usec, int fast_test, int quiet, int delta_ticks, int delta_nsec) | |
345 | { | |
346 | int i; | |
347 | long long delta; | |
348 | unsigned long long max_ticks; | |
349 | unsigned long long nsecs; | |
350 | void *mult; | |
351 | unsigned long long test_ns[TESTLEN] = | |
352 | {NSEC_ONE_SEC, NSEC_ONE_SEC, | |
353 | NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60, | |
354 | NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4, | |
355 | NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24}; | |
356 | unsigned long long test_ticks[TESTLEN]; | |
357 | ||
358 | max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL; | |
359 | ||
360 | switch(mode) { | |
361 | case CLOCK_64_MULT_SHIFT: { | |
362 | mult = &clock_mult; | |
363 | break; | |
364 | } | |
365 | case CLOCK_64_EMULATE_128: { | |
366 | mult = clock_mult64_128; | |
367 | break; | |
368 | } | |
369 | case CLOCK_64_2STAGE: { | |
370 | mult = &clock_mult; | |
371 | break; | |
372 | } | |
373 | case CLOCK_128_MULT_SHIFT: { | |
374 | mult = &clock_mult128; | |
375 | break; | |
376 | } | |
377 | } | |
378 | calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec); | |
379 | nsecs = get_nsec(mode, max_ticks); | |
380 | delta = nsecs/1000000 - MAX_CLOCK_SEC*1000; | |
381 | ||
382 | if (mode == CLOCK_64_2STAGE) { | |
383 | test_ns[0] = nsecs_for_max_cycles - 1; | |
384 | test_ns[1] = nsecs_for_max_cycles; | |
385 | test_ticks[0] = (1ULL << max_cycles_shift) - 1; | |
386 | test_ticks[1] = (1ULL << max_cycles_shift); | |
387 | ||
388 | for (i = 2; i < TESTLEN; i++) | |
389 | test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; | |
390 | } | |
391 | else { | |
392 | for (i = 0; i < TESTLEN; i++) | |
393 | test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; | |
394 | } | |
395 | ||
396 | if (!quiet) { | |
397 | printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n", | |
398 | cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift)); | |
399 | switch(mode) { | |
400 | case CLOCK_64_2STAGE: | |
401 | case CLOCK_64_MULT_SHIFT: { | |
402 | printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n", | |
403 | clock_mult, (double) clock_mult / (1ULL << clock_shift)); | |
404 | break; | |
405 | } | |
406 | case CLOCK_64_EMULATE_128: { | |
407 | printf("clock_mult=0x%016llx%016llx\n", | |
408 | clock_mult64_128[1], clock_mult64_128[0]); | |
409 | break; | |
410 | } | |
411 | case CLOCK_128_MULT_SHIFT: { | |
412 | printf("clock_mult=0x%016llx%016llx\n", | |
413 | (unsigned long long) (clock_mult128 >> 64), | |
414 | (unsigned long long) clock_mult128); | |
415 | break; | |
416 | } | |
417 | } | |
418 | printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n", | |
419 | nsecs, MAX_CLOCK_SEC*1000000000ULL, delta); | |
420 | } | |
421 | ||
422 | for (i = 0; i < TESTLEN; i++) | |
423 | { | |
424 | nsecs = get_nsec(mode, test_ticks[i]); | |
425 | delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs; | |
426 | if (!quiet || delta > 0) | |
427 | printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n", | |
428 | test_ticks[i], nsecs, test_ns[i], delta); | |
429 | } | |
430 | ||
431 | if (!fast_test) { | |
432 | discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN); | |
433 | discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN); | |
434 | } | |
435 | ||
436 | if (!quiet) | |
437 | printf("\n\n"); | |
438 | ||
439 | return delta; | |
440 | } | |
441 | ||
442 | int main(int argc, char *argv[]) | |
443 | { | |
444 | int i, days; | |
445 | long long error; | |
446 | long long errors[10001]; | |
447 | double mean; | |
448 | ||
449 | nsecs = malloc(LEN * sizeof(unsigned long long)); | |
450 | assert(nsecs != NULL); | |
451 | days = MAX_CLOCK_SEC / 60 / 60 / 24; | |
452 | ||
453 | test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 0, 0); | |
454 | // test_clock(CLOCK_64_MULT_SHIFT, 3333, 1, 0, 0, 0); | |
455 | // test_clock(CLOCK_128_MULT_SHIFT, 3333, 1, 0, 0, 0); | |
456 | ||
457 | // Test 3 different clock types from 1000 to 10000 MHz | |
458 | // and calculate average error | |
459 | /* | |
460 | for (i = 1000, mean = 0.0; i <= 10000; i++) { | |
461 | error = test_clock(CLOCK_64_MULT_SHIFT, i, 1, 1, 0, 0); | |
462 | errors[i] = error > 0 ? error : -1LL * error; | |
463 | mean += (double) errors[i] / 9000; | |
464 | } | |
465 | printf(" 64-bit average error per %d days: %fms\n", days, mean); | |
466 | ||
467 | for (i = 1000, mean = 0.0; i <= 10000; i++) { | |
468 | error = test_clock(CLOCK_64_2STAGE, i, 1, 1, 0, 0); | |
469 | errors[i] = error > 0 ? error : -1LL * error; | |
470 | mean += (double) errors[i] / 9000; | |
471 | } | |
472 | printf(" 64-bit two-stage average error per %d days: %fms\n", days, mean); | |
473 | ||
474 | for (i = 1000, mean = 0.0; i <= 10000; i++) { | |
475 | error = test_clock(CLOCK_128_MULT_SHIFT, i, 1, 1, 0, 0); | |
476 | errors[i] = error > 0 ? error : -1LL * error; | |
477 | mean += (double) errors[i] / 9000; | |
478 | } | |
479 | printf(" 128-bit average error per %d days: %fms\n", days, mean); | |
480 | */ | |
481 | test_clock(CLOCK_64_2STAGE, 1000, 1, 0, 1, 1); | |
482 | test_clock(CLOCK_64_2STAGE, 1100, 1, 0, 11, 10); | |
483 | test_clock(CLOCK_64_2STAGE, 3000, 1, 0, 3, 1); | |
484 | test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 3333, 1000); | |
485 | test_clock(CLOCK_64_2STAGE, 3392, 1, 0, 424, 125); | |
486 | test_clock(CLOCK_64_2STAGE, 4500, 1, 0, 9, 2); | |
487 | test_clock(CLOCK_64_2STAGE, 5000, 1, 0, 5, 1); | |
488 | ||
489 | free(nsecs); | |
490 | return 0; | |
491 | } |