nanosecond: add test program t/time-test for experimenting with cpu clock ticks to...
[fio.git] / t/time-test.c
/*
 * Carry out arithmetic to explore conversion of CPU clock ticks to nsec
 *
 * When we use the CPU clock for timing, we do the following:
 *
 * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks
 *    to actual time.
 *
 *    Using gettimeofday() or clock_gettime(), count how many CPU clock
 *    ticks occur per usec
 *
 * 2) Calculate conversion factors so that we can ultimately convert
 *    from clock ticks to nsec with
 *        nsec = (ticks * clock_mult) >> clock_shift
 *
 *    This is equivalent to
 *        nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER
 *    where
 *        clock_mult = MULTIPLIER / cycles_per_nsec
 *        MULTIPLIER = 2^clock_shift
 *
 *    It would be simpler to just calculate nsec = ticks / cycles_per_nsec,
 *    but all of this is necessary because of rounding when calculating
 *    cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply
 *    be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional
 *    portion is lost with integer arithmetic.
 *
 *    This multiply and shift calculation also has a performance benefit
 *    as multiplication and bit shift operations are faster than integer
 *    division.
 *
 * 3) Dynamically determine clock_shift and clock_mult at run time based
 *    on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum
 *    duration for which the conversion will be valid.
 *
 *    The primary constraint is that (ticks * clock_mult) must not overflow
 *    when ticks is at its maximum value.
 *
 *    So we have
 *        max_ticks * clock_mult <= ULLONG_MAX
 *        max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX
 *        MULTIPLIER <= (ULLONG_MAX / max_ticks) * cycles_per_nsec
 *
 *    Then choose the largest clock_shift that satisfies
 *        2^clock_shift <= (ULLONG_MAX / max_ticks) * cycles_per_nsec
 *
 *    Finally calculate the appropriate clock_mult associated with clock_shift
 *        clock_mult = 2^clock_shift / cycles_per_nsec
 *
 * 4) In the code below we have cycles_per_usec and use
 *        cycles_per_nsec = cycles_per_usec / 1000
 *
 */
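
/*
 * Worked example (illustrative only, not part of the original code):
 * assume a hypothetical 3.333 GHz CPU, so cycles_per_usec = 3333, and
 * MAX_CLOCK_SEC = 365 days. Then, approximately:
 *
 *     max_ticks  = 31,536,000 s * 3333 ticks/usec * 1,000,000 usec/s ~= 1.05e17
 *     max_mult   = ULLONG_MAX / max_ticks ~= 175
 *     2^clock_shift <= max_mult * cycles_per_nsec ~= 175 * 3.333 ~= 583
 *                   => clock_shift = 9
 *     clock_mult = (1 << 9) * 1000 / 3333 = 153
 *
 * so nsec = (ticks * 153) >> 9, i.e. ticks * ~0.2988, versus an ideal factor
 * of 1/3.333 ~= 0.3000. The precision lost when MAX_CLOCK_SEC is this large
 * is what the two-stage conversion tested below tries to recover.
 */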

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <assert.h>

#define DEBUG 0
#define MAX_CLOCK_SEC 365*24*60*60ULL
#define MAX_CLOCK_SEC64 60*60ULL
#define dprintf(...) if (DEBUG) { printf(__VA_ARGS__); }

enum {
	__CLOCK_64_BIT = 1 << 0,
	__CLOCK_128_BIT = 1 << 1,
	__CLOCK_MULT_SHIFT = 1 << 2,
	__CLOCK_EMULATE_128 = 1 << 3,
	__CLOCK_REDUCE = 1 << 4,

	CLOCK_64_MULT_SHIFT = __CLOCK_64_BIT | __CLOCK_MULT_SHIFT,
	CLOCK_64_EMULATE_128 = __CLOCK_64_BIT | __CLOCK_EMULATE_128,
	CLOCK_64_2STAGE = __CLOCK_64_BIT | __CLOCK_REDUCE,
	CLOCK_128_MULT_SHIFT = __CLOCK_128_BIT | __CLOCK_MULT_SHIFT,
};

unsigned int clock_shift;
unsigned int max_cycles_shift;
unsigned long long max_cycles_mask;
unsigned long long *nsecs;
unsigned long long clock_mult;
unsigned long long nsecs_for_max_cycles;
unsigned long long clock_mult64_128[2];
__uint128_t clock_mult128;


/*
 * Functions for carrying out 128-bit
 * arithmetic using 64-bit integers
 *
 * 128-bit integers are stored as
 * arrays of two 64-bit integers
 *
 * Ordering is little endian
 *
 * a[0] has the less significant bits
 * a[1] has the more significant bits
 */
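/*
 * For example (illustration only): the 128-bit value 2^64 + 5 would be
 * stored as a[0] = 5 (low 64 bits) and a[1] = 1 (high 64 bits).
 */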
/* Multiply 128-bit a by 64-bit b, storing the 128-bit result in product.
 * NOTE: this is currently a placeholder that always returns zero, so the
 * CLOCK_64_EMULATE_128 path does not yet produce meaningful results. */
void do_mult(unsigned long long a[2], unsigned long long b, unsigned long long product[2])
{
	product[0] = product[1] = 0;
	return;
}

/* Divide 128-bit a by 64-bit b, storing the 128-bit quotient in c.
 * NOTE: this is currently a placeholder that leaves c untouched. */
void do_div(unsigned long long a[2], unsigned long long b, unsigned long long c[2])
{
	return;
}

/* Right-shift 128-bit a by count bits, for 64 < count < 128 */
void do_shift64(unsigned long long a[2], unsigned int count)
{
	a[0] = a[1] >> (count-64);
	a[1] = 0;
}

/* Right-shift 128-bit a by count bits */
void do_shift(unsigned long long a[2], unsigned int count)
{
	if (count > 64)
		do_shift64(a, count);
	else
		while (count--) {
			a[0] >>= 1;
			a[0] |= a[1] << 63;
			a[1] >>= 1;
		}
}

/*
 * Convert clock ticks t to nanoseconds using the
 * conversion scheme selected by mode
 */
unsigned long long get_nsec(int mode, unsigned long long t)
{
	switch(mode) {
	case CLOCK_64_MULT_SHIFT: {
		return (t * clock_mult) >> clock_shift;
	}
	case CLOCK_64_EMULATE_128: {
		unsigned long long product[2];

		do_mult(clock_mult64_128, t, product);
		do_shift(product, clock_shift);
		return product[0];
	}
	case CLOCK_64_2STAGE: {
		unsigned long long multiples, nsec;

		multiples = t >> max_cycles_shift;
		dprintf("multiples=%llu\n", multiples);
		nsec = multiples * nsecs_for_max_cycles;
		nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
		return nsec;
	}
	case CLOCK_128_MULT_SHIFT: {
		return (unsigned long long)((t * clock_mult128) >> clock_shift);
	}
	default: {
		assert(0);
	}
	}
}

void calc_mult_shift(int mode, void *mult, unsigned int *shift, unsigned long long max_sec, unsigned long long cycles_per_usec)
{
	unsigned long long max_ticks;
	max_ticks = max_sec * cycles_per_usec * 1000000ULL;

	switch (mode) {
	case CLOCK_64_MULT_SHIFT: {
		unsigned long long max_mult, tmp;
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 64-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion
		 */
		max_mult = ULLONG_MAX / max_ticks;
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		tmp = max_mult * cycles_per_usec / 1000;
		while (tmp > 1) {
			tmp >>= 1;
			sft++;
			dprintf("tmp=%llu, sft=%u\n", tmp, sft);
		}

		*shift = sft;
		*((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec);
		break;
	}
	case CLOCK_64_EMULATE_128: {
		unsigned long long max_mult[2], tmp[2];
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 128-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion,
		 * but use only 64-bit integers in the process
		 */
		max_mult[0] = max_mult[1] = ULLONG_MAX;
		do_div(max_mult, max_ticks, max_mult);
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
			max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		do_div(max_mult, cycles_per_usec, tmp);
		do_div(tmp, 1000ULL, tmp);
		while (tmp[1] > 0 || tmp[0] > 1) {
			do_shift(tmp, 1);
			sft++;
			dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft);
		}

		*shift = sft;
//		*((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
		break;
	}
	case CLOCK_64_2STAGE: {
		unsigned long long tmp;
/*
 * This clock tick to nsec conversion requires two stages.
 *
 * Stage 1: Determine how many ~MAX_CLOCK_SEC64 periods worth of clock ticks
 * have elapsed and set nsecs to the appropriate value for those
 * ~MAX_CLOCK_SEC64 periods.
 * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC64 periods from
 * Stage 1. Convert remaining clock ticks to nsecs and add to previously
 * set nsec value.
 *
 * To optimize the arithmetic operations, use the greatest power of 2 ticks
 * less than the number of ticks in MAX_CLOCK_SEC64 seconds.
 *
 */
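/*
 * Illustrative numbers (hypothetical, not from the original code): with
 * cycles_per_usec = 3333, MAX_CLOCK_SEC64 (one hour) spans about 1.2e13
 * ticks, and the greatest power of 2 below that is 2^43. So
 * max_cycles_shift = 43, each block of 2^43 ticks (~44 minutes at 3.333 GHz)
 * contributes nsecs_for_max_cycles, and only the remaining
 * (ticks & max_cycles_mask) ticks go through the mult/shift conversion.
 */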
		// Use a period shorter than MAX_CLOCK_SEC here for better accuracy
		calc_mult_shift(CLOCK_64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC64, cycles_per_usec);

		// Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC64
		max_cycles_shift = max_cycles_mask = 0;
		tmp = MAX_CLOCK_SEC64 * 1000000ULL * cycles_per_usec;
		dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
		while (tmp > 1) {
			tmp >>= 1;
			max_cycles_shift++;
			dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
		}
		// if we use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will
		// have a discontinuity every (1ULL << max_cycles_shift) cycles
		nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift;

		// Use a bitmask to calculate ticks % (1ULL << max_cycles_shift)
		for (tmp = 0; tmp < max_cycles_shift; tmp++)
			max_cycles_mask |= 1ULL << tmp;

		dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n",
			max_cycles_shift, (1ULL << max_cycles_shift),
			nsecs_for_max_cycles, max_cycles_mask);

		break;
	}
	case CLOCK_128_MULT_SHIFT: {
		__uint128_t max_mult, tmp;
		unsigned int sft = 0;

		/*
		 * Calculate the largest multiplier that will not
		 * produce a 128-bit overflow in the multiplication
		 * step of the clock ticks to nsec conversion
		 */
		max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX;
		max_mult /= max_ticks;
		dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
			max_ticks, __builtin_clzll(max_ticks),
			(unsigned long long) (max_mult >> 64),
			(unsigned long long) max_mult);

		/*
		 * Find the largest shift count that will produce
		 * a multiplier less than max_mult
		 */
		tmp = max_mult * cycles_per_usec / 1000;
		while (tmp > 1) {
			tmp >>= 1;
			sft++;
			dprintf("tmp=0x%016llx%016llx, sft=%u\n",
				(unsigned long long) (tmp >> 64),
				(unsigned long long) tmp, sft);
		}

		*shift = sft;
		*((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
		break;
	}
	}
}

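/*
 * For len consecutive tick values starting at start, convert each to nsec
 * with get_nsec() and check that values delta_ticks apart differ by
 * delta_nsec ns. Any deviation is counted as a discontinuity, and deviations
 * larger than 1 ns are counted as errors. Returns the number of
 * discontinuities found.
 */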
int discontinuity(int mode, int delta_ticks, int delta_nsec, unsigned long long start, unsigned long len)
{
	int i;
	unsigned long mismatches = 0, bad_mismatches = 0;
	unsigned long long delta, max_mismatch = 0;
	unsigned long long *ns = nsecs;

	for (i = 0; i < len; ns++, i++) {
		*ns = get_nsec(mode, start + i);
		if (i - delta_ticks >= 0) {
			if (*ns > *(ns - delta_ticks))
				delta = *ns - *(ns - delta_ticks);
			else
				delta = *(ns - delta_ticks) - *ns;
			if (delta > delta_nsec)
				delta -= delta_nsec;
			else
				delta = delta_nsec - delta;
			if (delta) {
				mismatches++;
				if (delta > 1)
					bad_mismatches++;
				if (delta > max_mismatch)
					max_mismatch = delta;
			}
		}
		if (!bad_mismatches)
			assert(max_mismatch == 0 || max_mismatch == 1);
		if (!mismatches)
			assert(max_mismatch == 0);
	}

	printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n",
		mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start,
		start + len - 1);
	return mismatches;
}

#define MIN_TICKS 1ULL
#define LEN 1000000000ULL
#define NSEC_ONE_SEC 1000000000ULL
#define TESTLEN 9
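/*
 * Calibrate clock_mult/clock_shift for the given mode and clock rate
 * (cycles_per_usec), then check the tick-to-nsec conversion at max_ticks
 * and at a set of spot-check tick values. When fast_test is zero, also scan
 * two ranges of LEN consecutive tick values for discontinuities. Returns
 * the conversion error in msec at max_ticks.
 */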
long long test_clock(int mode, int cycles_per_usec, int fast_test, int quiet, int delta_ticks, int delta_nsec)
{
	int i;
	long long delta;
	unsigned long long max_ticks;
	unsigned long long nsecs;
	void *mult;
	unsigned long long test_ns[TESTLEN] =
			{NSEC_ONE_SEC, NSEC_ONE_SEC,
			 NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60,
			 NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4,
			 NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24};
	unsigned long long test_ticks[TESTLEN];

	max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL;

	switch(mode) {
	case CLOCK_64_MULT_SHIFT: {
		mult = &clock_mult;
		break;
	}
	case CLOCK_64_EMULATE_128: {
		mult = clock_mult64_128;
		break;
	}
	case CLOCK_64_2STAGE: {
		mult = &clock_mult;
		break;
	}
	case CLOCK_128_MULT_SHIFT: {
		mult = &clock_mult128;
		break;
	}
	}
	calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec);
	nsecs = get_nsec(mode, max_ticks);
	delta = nsecs/1000000 - MAX_CLOCK_SEC*1000;

	if (mode == CLOCK_64_2STAGE) {
		test_ns[0] = nsecs_for_max_cycles - 1;
		test_ns[1] = nsecs_for_max_cycles;
		test_ticks[0] = (1ULL << max_cycles_shift) - 1;
		test_ticks[1] = (1ULL << max_cycles_shift);

		for (i = 2; i < TESTLEN; i++)
			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
	}
	else {
		for (i = 0; i < TESTLEN; i++)
			test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
	}

	if (!quiet) {
		printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n",
			cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift));
		switch(mode) {
		case CLOCK_64_2STAGE:
		case CLOCK_64_MULT_SHIFT: {
			printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n",
				clock_mult, (double) clock_mult / (1ULL << clock_shift));
			break;
		}
		case CLOCK_64_EMULATE_128: {
			printf("clock_mult=0x%016llx%016llx\n",
				clock_mult64_128[1], clock_mult64_128[0]);
			break;
		}
		case CLOCK_128_MULT_SHIFT: {
			printf("clock_mult=0x%016llx%016llx\n",
				(unsigned long long) (clock_mult128 >> 64),
				(unsigned long long) clock_mult128);
			break;
		}
		}
		printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n",
			nsecs, MAX_CLOCK_SEC*1000000000ULL, delta);
	}

	for (i = 0; i < TESTLEN; i++)
	{
		nsecs = get_nsec(mode, test_ticks[i]);
		delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs;
		if (!quiet || delta > 0)
			printf("get_nsec(%llu)=%llu, expected %llu, delta=%llu\n",
				test_ticks[i], nsecs, test_ns[i], delta);
	}

	if (!fast_test) {
		discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN);
		discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN);
	}

	if (!quiet)
		printf("\n\n");

	return delta;
}

int main(int argc, char *argv[])
{
	int i, days;
	long long error;
	long long errors[10001];
	double mean;
	/* i, error, errors, and mean are only used by the
	 * commented-out frequency sweep below */

	nsecs = malloc(LEN * sizeof(unsigned long long));
	assert(nsecs != NULL);
	days = MAX_CLOCK_SEC / 60 / 60 / 24;

	test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 0, 0);
//	test_clock(CLOCK_64_MULT_SHIFT, 3333, 1, 0, 0, 0);
//	test_clock(CLOCK_128_MULT_SHIFT, 3333, 1, 0, 0, 0);

// Test 3 different clock types from 1000 to 10000 MHz
// and calculate average error
/*
	for (i = 1000, mean = 0.0; i <= 10000; i++) {
		error = test_clock(CLOCK_64_MULT_SHIFT, i, 1, 1, 0, 0);
		errors[i] = error > 0 ? error : -1LL * error;
		mean += (double) errors[i] / 9000;
	}
	printf(" 64-bit average error per %d days: %fms\n", days, mean);

	for (i = 1000, mean = 0.0; i <= 10000; i++) {
		error = test_clock(CLOCK_64_2STAGE, i, 1, 1, 0, 0);
		errors[i] = error > 0 ? error : -1LL * error;
		mean += (double) errors[i] / 9000;
	}
	printf(" 64-bit two-stage average error per %d days: %fms\n", days, mean);

	for (i = 1000, mean = 0.0; i <= 10000; i++) {
		error = test_clock(CLOCK_128_MULT_SHIFT, i, 1, 1, 0, 0);
		errors[i] = error > 0 ? error : -1LL * error;
		mean += (double) errors[i] / 9000;
	}
	printf(" 128-bit average error per %d days: %fms\n", days, mean);
*/
	test_clock(CLOCK_64_2STAGE, 1000, 1, 0, 1, 1);
	test_clock(CLOCK_64_2STAGE, 1100, 1, 0, 11, 10);
	test_clock(CLOCK_64_2STAGE, 3000, 1, 0, 3, 1);
	test_clock(CLOCK_64_2STAGE, 3333, 1, 0, 3333, 1000);
	test_clock(CLOCK_64_2STAGE, 3392, 1, 0, 424, 125);
	test_clock(CLOCK_64_2STAGE, 4500, 1, 0, 9, 2);
	test_clock(CLOCK_64_2STAGE, 5000, 1, 0, 5, 1);

	free(nsecs);
	return 0;
}