From: Vincent Fu Date: Tue, 11 Apr 2017 19:57:15 +0000 (-0400) Subject: nanosecond: add test program t/time-test for experimenting with cpu clock ticks to... X-Git-Tag: fio-2.99~72^2~1^2~5 X-Git-Url: https://git.kernel.dk/?p=fio.git;a=commitdiff_plain;h=dcbc78414b244a313d46eb61cb34f14f2e9988bd;hp=6d02b37b62ba43630bdf6ddebead5bf474daf40c nanosecond: add test program t/time-test for experimenting with cpu clock ticks to nsec conversion --- diff --git a/Makefile b/Makefile index d7786d2b..64fa97a7 100644 --- a/Makefile +++ b/Makefile @@ -250,6 +250,9 @@ T_PIPE_ASYNC_PROGS = t/read-to-pipe-async T_MEMLOCK_OBJS = t/memlock.o T_MEMLOCK_PROGS = t/memlock +T_TT_OBJS = t/time-test.o +T_TT_PROGS = t/time-test + T_OBJS = $(T_SMALLOC_OBJS) T_OBJS += $(T_IEEE_OBJS) T_OBJS += $(T_ZIPF_OBJS) @@ -261,6 +264,7 @@ T_OBJS += $(T_DEDUPE_OBJS) T_OBJS += $(T_VS_OBJS) T_OBJS += $(T_PIPE_ASYNC_OBJS) T_OBJS += $(T_MEMLOCK_OBJS) +T_OBJS += $(T_TT_OBJS) ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) T_DEDUPE_OBJS += os/windows/posix.o lib/hweight.o @@ -434,6 +438,9 @@ t/fio-dedupe: $(T_DEDUPE_OBJS) t/fio-verify-state: $(T_VS_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS) +t/time-test: $(T_TT_OBJS) + $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS) + clean: FORCE @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d oslib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h @rm -rf doc/output diff --git a/t/time-test.c b/t/time-test.c new file mode 100644 index 00000000..831882b8 --- /dev/null +++ b/t/time-test.c @@ -0,0 +1,491 @@ +/* + * Carry out arithmetic to explore conversion of CPU clock ticks to nsec + * + * When we use the CPU clock for timing, we do the following: + * + * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks + * to actual time. + * + * Using gettimeofday() or clock_gettime(), count how many CPU clock + * ticks occur per usec + * + * 2) Calculate conversion factors so that we can ultimately convert + * from clocks ticks to nsec with + * nsec = (ticks * clock_mult) >> clock_shift + * + * This is equivalent to + * nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER + * where + * clock_mult = MULTIPLIER / cycles_per_nsec + * MULTIPLIER = 2^clock_shift + * + * It would be simpler to just calculate nsec = ticks / cycles_per_nsec, + * but all of this is necessary because of rounding when calculating + * cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply + * be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional + * portion is lost with integer arithmetic. + * + * This multiply and shift calculation also has a performance benefit + * as multiplication and bit shift operations are faster than integer + * division. + * + * 3) Dynamically determine clock_shift and clock_mult at run time based + * on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum + * duration for which the conversion will be valid. + * + * The primary constraint is that (ticks * clock_mult) must not overflow + * when ticks is at its maximum value. + * + * So we have + * max_ticks * clock_mult <= ULLONG_MAX + * max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX + * MULTIPLIER <= ULLONG_MAX / max_ticks * cycles_per_nsec + * + * Then choose the largest clock_shift that satisfies + * 2^clock_shift <= ULLONG_MAX / max_ticks * cycles_per_nsec + * + * Finally calculate the appropriate clock_mult associated with clock_shift + * clock_mult = 2^clock_shift / cycles_per_nsec + * + * 4) In the code below we have cycles_per_usec and use + * cycles_per_nsec = cycles_per_usec / 1000 + * + */ + +#include +#include +#include +#include +#include + +#define DEBUG 0 +#define MAX_CLOCK_SEC 365*24*60*60ULL +#define MAX_CLOCK_SEC64 60*60ULL +#define dprintf(...) if (DEBUG) { printf(__VA_ARGS__); } + +enum { + __CLOCK_64_BIT = 1 << 0, + __CLOCK_128_BIT = 1 << 1, + __CLOCK_MULT_SHIFT = 1 << 2, + __CLOCK_EMULATE_128 = 1 << 3, + __CLOCK_REDUCE = 1 << 4, + + CLOCK_64_MULT_SHIFT = __CLOCK_64_BIT | __CLOCK_MULT_SHIFT, + CLOCK_64_EMULATE_128 = __CLOCK_64_BIT | __CLOCK_EMULATE_128, + CLOCK_64_2STAGE = __CLOCK_64_BIT | __CLOCK_REDUCE, + CLOCK_128_MULT_SHIFT = __CLOCK_128_BIT | __CLOCK_MULT_SHIFT, +}; + +unsigned int clock_shift; +unsigned int max_cycles_shift; +unsigned long long max_cycles_mask; +unsigned long long *nsecs; +unsigned long long clock_mult; +unsigned long long nsecs_for_max_cycles; +unsigned long long clock_mult64_128[2]; +__uint128_t clock_mult128; + + +/* + * Functions for carrying out 128-bit + * arithmetic using 64-bit integers + * + * 128-bit integers are stored as + * arrays of two 64-bit integers + * + * Ordering is little endian + * + * a[0] has the less significant bits + * a[1] has the more significant bits + */ +void do_mult(unsigned long long a[2], unsigned long long b, unsigned long long product[2]) +{ + product[0] = product[1] = 0; + return; +} + +void do_div(unsigned long long a[2], unsigned long long b, unsigned long long c[2]) +{ + return; +} + +void do_shift64(unsigned long long a[2], unsigned int count) +{ + a[0] = a[1] >> (count-64); + a[1] = 0; +} + +void do_shift(unsigned long long a[2], unsigned int count) +{ + if (count > 64) + do_shift64(a, count); + else + while (count--) { + a[0] >>= 1; + a[0] |= a[1] << 63; + a[1] >>= 1; + } +} + +unsigned long long get_nsec(int mode, unsigned long long t) +{ + switch(mode) { + case CLOCK_64_MULT_SHIFT: { + return (t * clock_mult) >> clock_shift; + } + case CLOCK_64_EMULATE_128: { + unsigned long long product[2]; + do_mult(clock_mult64_128, t, product); + do_shift(product, clock_shift); + return product[0]; + } + case CLOCK_64_2STAGE: { + unsigned long long multiples, nsec; + multiples = t >> max_cycles_shift; + dprintf("multiples=%llu\n", multiples); + nsec = multiples * nsecs_for_max_cycles; + nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift; + return nsec; + } + case CLOCK_128_MULT_SHIFT: { + return (unsigned long long)((t * clock_mult128) >> clock_shift); + } + default: { + assert(0); + } + } +} + +void calc_mult_shift(int mode, void *mult, unsigned int *shift, unsigned long long max_sec, unsigned long long cycles_per_usec) +{ + unsigned long long max_ticks; + max_ticks = max_sec * cycles_per_usec * 1000000ULL; + + switch (mode) { + case CLOCK_64_MULT_SHIFT: { + unsigned long long max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 64-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ULLONG_MAX / max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=%llu, sft=%u\n", tmp, sft); + } + + *shift = sft; + *((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK_64_EMULATE_128: { + unsigned long long max_mult[2], tmp[2]; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion, + * but use only 64-bit integers in the process + */ + max_mult[0] = max_mult[1] = ULLONG_MAX; + do_div(max_mult, max_ticks, max_mult); + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + do_div(max_mult, cycles_per_usec, tmp); + do_div(tmp, 1000ULL, tmp); + while (tmp[0] > 1 || tmp[1] > 1) { + do_shift(tmp, 1); + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft); + } + + *shift = sft; +// *((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + case CLOCK_64_2STAGE: { + unsigned long long tmp; +/* + * This clock tick to nsec conversion requires two stages. + * + * Stage 1: Determine how many ~MAX_CLOCK_SEC64 periods worth of clock ticks + * have elapsed and set nsecs to the appropriate value for those + * ~MAX_CLOCK_SEC64 periods. + * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC64 periods from + * Stage 1. Convert remaining clock ticks to nsecs and add to previously + * set nsec value. + * + * To optimize the arithmetic operations, use the greatest power of 2 ticks + * less than the number of ticks in MAX_CLOCK_SEC64 seconds. + * + */ + // Use a period shorter than MAX_CLOCK_SEC here for better accuracy + calc_mult_shift(CLOCK_64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC64, cycles_per_usec); + + // Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC64 + max_cycles_shift = max_cycles_mask = 0; + tmp = MAX_CLOCK_SEC64 * 1000000ULL * cycles_per_usec; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + while (tmp > 1) { + tmp >>= 1; + max_cycles_shift++; + dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift); + } + // if use use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will + // have a discontinuity every (1ULL << max_cycles_shift) cycles + nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift; + + // Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) + for (tmp = 0; tmp < max_cycles_shift; tmp++) + max_cycles_mask |= 1ULL << tmp; + + dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n", + max_cycles_shift, (1ULL << max_cycles_shift), + nsecs_for_max_cycles, max_cycles_mask); + + + break; + } + case CLOCK_128_MULT_SHIFT: { + __uint128_t max_mult, tmp; + unsigned int sft = 0; + + /* + * Calculate the largest multiplier that will not + * produce a 128-bit overflow in the multiplication + * step of the clock ticks to nsec conversion + */ + max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX; + max_mult /= max_ticks; + dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n", + max_ticks, __builtin_clzll(max_ticks), + (unsigned long long) (max_mult >> 64), + (unsigned long long) max_mult); + + /* + * Find the largest shift count that will produce + * a multiplier less than max_mult + */ + tmp = max_mult * cycles_per_usec / 1000; + while (tmp > 1) { + tmp >>= 1; + sft++; + dprintf("tmp=0x%016llx%016llx, sft=%u\n", + (unsigned long long) (tmp >> 64), + (unsigned long long) tmp, sft); + } + + *shift = sft; + *((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec); + break; + } + } +} + +int discontinuity(int mode, int delta_ticks, int delta_nsec, unsigned long long start, unsigned long len) +{ + int i; + unsigned long mismatches = 0, bad_mismatches = 0; + unsigned long long delta, max_mismatch = 0; + unsigned long long *ns = nsecs; + + for (i = 0; i < len; ns++, i++) { + *ns = get_nsec(mode, start + i); + if (i - delta_ticks >= 0) { + if (*ns > *(ns - delta_ticks)) + delta = *ns - *(ns - delta_ticks); + else + delta = *(ns - delta_ticks) - *ns; + if (delta > delta_nsec) + delta -= delta_nsec; + else + delta = delta_nsec - delta; + if (delta) { + mismatches++; + if (delta > 1) + bad_mismatches++; + if (delta > max_mismatch) + max_mismatch = delta; + } + } + if (!bad_mismatches) + assert(max_mismatch == 0 || max_mismatch == 1); + if (!mismatches) + assert(max_mismatch == 0); + } + + printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n", + mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start, + start + len - 1); + return mismatches; +} + +#define MIN_TICKS 1ULL +#define LEN 1000000000ULL +#define NSEC_ONE_SEC 1000000000ULL +#define TESTLEN 9 +long long test_clock(int mode, int cycles_per_usec, int fast_test, int quiet, int delta_ticks, int delta_nsec) +{ + int i; + long long delta; + unsigned long long max_ticks; + unsigned long long nsecs; + void *mult; + unsigned long long test_ns[TESTLEN] = + {NSEC_ONE_SEC, NSEC_ONE_SEC, + NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60, + NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4, + NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24}; + unsigned long long test_ticks[TESTLEN]; + + max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL; + + switch(mode) { + case CLOCK_64_MULT_SHIFT: { + mult = &clock_mult; + break; + } + case CLOCK_64_EMULATE_128: { + mult = clock_mult64_128; + break; + } + case CLOCK_64_2STAGE: { + mult = &clock_mult; + break; + } + case CLOCK_128_MULT_SHIFT: { + mult = &clock_mult128; + break; + } + } + calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec); + nsecs = get_nsec(mode, max_ticks); + delta = nsecs/1000000 - MAX_CLOCK_SEC*1000; + + if (mode == CLOCK_64_2STAGE) { + test_ns[0] = nsecs_for_max_cycles - 1; + test_ns[1] = nsecs_for_max_cycles; + test_ticks[0] = (1ULL << max_cycles_shift) - 1; + test_ticks[1] = (1ULL << max_cycles_shift); + + for (i = 2; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + else { + for (i = 0; i < TESTLEN; i++) + test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec; + } + + if (!quiet) { + printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n", + cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift)); + switch(mode) { + case CLOCK_64_2STAGE: + case CLOCK_64_MULT_SHIFT: { + printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n", + clock_mult, (double) clock_mult / (1ULL << clock_shift)); + break; + } + case CLOCK_64_EMULATE_128: { + printf("clock_mult=0x%016llx%016llx\n", + clock_mult64_128[1], clock_mult64_128[0]); + break; + } + case CLOCK_128_MULT_SHIFT: { + printf("clock_mult=0x%016llx%016llx\n", + (unsigned long long) (clock_mult128 >> 64), + (unsigned long long) clock_mult128); + break; + } + } + printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n", + nsecs, MAX_CLOCK_SEC*1000000000ULL, delta); + } + + for (i = 0; i < TESTLEN; i++) + { + nsecs = get_nsec(mode, test_ticks[i]); + delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs; + if (!quiet || delta > 0) + printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n", + test_ticks[i], nsecs, test_ns[i], delta); + } + + if (!fast_test) { + discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN); + discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN); + } + + if (!quiet) + printf("\n\n"); + + return delta; +} + +int main(int argc, char *argv[]) +{ + int i, days; + long long error; + long long errors[10001]; + double mean; + + nsecs = malloc(LEN * sizeof(unsigned long long)); + assert(nsecs != NULL); + days = MAX_CLOCK_SEC / 60 / 60 / 24; + + test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 0, 0); +// test_clock(CLOCK_64_MULT_SHIFT, 3333, 1, 0, 0, 0); +// test_clock(CLOCK_128_MULT_SHIFT, 3333, 1, 0, 0, 0); + +// Test 3 different clock types from 1000 to 10000 MHz +// and calculate average error +/* + for (i = 1000, mean = 0.0; i <= 10000; i++) { + error = test_clock(CLOCK_64_MULT_SHIFT, i, 1, 1, 0, 0); + errors[i] = error > 0 ? error : -1LL * error; + mean += (double) errors[i] / 9000; + } + printf(" 64-bit average error per %d days: %fms\n", days, mean); + + for (i = 1000, mean = 0.0; i <= 10000; i++) { + error = test_clock(CLOCK_64_2STAGE, i, 1, 1, 0, 0); + errors[i] = error > 0 ? error : -1LL * error; + mean += (double) errors[i] / 9000; + } + printf(" 64-bit two-stage average error per %d days: %fms\n", days, mean); + + for (i = 1000, mean = 0.0; i <= 10000; i++) { + error = test_clock(CLOCK_128_MULT_SHIFT, i, 1, 1, 0, 0); + errors[i] = error > 0 ? error : -1LL * error; + mean += (double) errors[i] / 9000; + } + printf(" 128-bit average error per %d days: %fms\n", days, mean); +*/ + test_clock(CLOCK_64_2STAGE, 1000, 1, 0, 1, 1); + test_clock(CLOCK_64_2STAGE, 1100, 1, 0, 11, 10); + test_clock(CLOCK_64_2STAGE, 3000, 1, 0, 3, 1); + test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 3333, 1000); + test_clock(CLOCK_64_2STAGE, 3392, 1, 0, 424, 125); + test_clock(CLOCK_64_2STAGE, 4500, 1, 0, 9, 2); + test_clock(CLOCK_64_2STAGE, 5000, 1, 0, 5, 1); + + free(nsecs); + return 0; +}