From: Jens Axboe Date: Sat, 25 Sep 2021 20:25:05 +0000 (-0600) Subject: t/io_uring: add support for latency tracking X-Git-Tag: fio-3.29~94 X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=932131c944b10f2a03f4028318c454c98eca489f;p=fio.git t/io_uring: add support for latency tracking This will display the latency percentiles for the run when done, per submitter thread. It takes two arguments: -t Enable latency tracking if x is non-zero -T Set TSC clock rate to Y Hz The tsc rate can be programatically deduced (fio does this), for now pass it in. dmesg will generally tell you: tsc: Refined TSC clocksource calibration: 3699.889 MHz and you'd then do: -t1 -T3699889000 for that. Here's an example, synchronous optane gen2 read: [...] IOPS=254118, BW=124MiB/s, IOS/call=0/0, inflight=(1) IOPS=255024, BW=124MiB/s, IOS/call=0/0, inflight=(1) IOPS=255100, BW=124MiB/s, IOS/call=0/0, inflight=(1) IOPS=254791, BW=124MiB/s, IOS/call=0/0, inflight=(1) ^CExiting on signal 2 IOPS=100086, BW=48MiB/s, IOS/call=1/1, inflight=(1) 515102: Latency percentiles: percentiles (nsec): | 1.0000th=[ 3857], 5.0000th=[ 3857], 10.0000th=[ 3857], | 20.0000th=[ 3857], 30.0000th=[ 3857], 40.0000th=[ 3892], | 50.0000th=[ 3892], 60.0000th=[ 3892], 70.0000th=[ 3892], | 80.0000th=[ 3892], 90.0000th=[ 3961], 95.0000th=[ 3961], | 99.9000th=[ 8752], 99.5000th=[ 8752], 99.9000th=[ 8752], | 99.9500th=[ 9064], 99.9900th=[ 9755] Or a higher depth run: IOPS=3549568, BW=1733MiB/s, IOS/call=32/32, inflight=(64) IOPS=3547712, BW=1732MiB/s, IOS/call=32/31, inflight=(111) IOPS=3549504, BW=1733MiB/s, IOS/call=32/31, inflight=(128) ^CExiting on signal 2 IOPS=1413600, BW=690MiB/s, IOS/call=32/32, inflight=(35) 515078: Latency percentiles: percentiles (nsec): | 1.0000th=[13630], 5.0000th=[14322], 10.0000th=[15291], | 20.0000th=[16121], 30.0000th=[20065], 40.0000th=[21726], | 50.0000th=[22279], 60.0000th=[26154], 70.0000th=[27814], | 80.0000th=[28368], 90.0000th=[33903], 95.0000th=[34180], | 99.9000th=[52862], 99.5000th=[52862], 99.9000th=[52862], | 99.9500th=[56183], 99.9900th=[67807] Note that latency tracking isn't cheap, even if we tried to do it in the cheapest way possible. The peak workload shown here will run at ~3.7M IOPS without tracking, and as shown about 3.55M with tracking enabled. Signed-off-by: Jens Axboe --- diff --git a/t/io_uring.c b/t/io_uring.c index d5636380..903e7cf8 100644 --- a/t/io_uring.c +++ b/t/io_uring.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -22,10 +23,10 @@ #include "../arch/arch.h" #include "../lib/types.h" +#include "../lib/roundup.h" +#include "../minmax.h" #include "../os/linux/io_uring.h" -#define min(a, b) ((a < b) ? (a) : (b)) - struct io_sq_ring { unsigned *head; unsigned *tail; @@ -57,8 +58,14 @@ struct file { unsigned pending_ios; int real_fd; int fixed_fd; + int fileno; }; +#define PLAT_BITS 6 +#define PLAT_VAL (1 << PLAT_BITS) +#define PLAT_GROUP_NR 29 +#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL) + struct submitter { pthread_t thread; int ring_fd; @@ -67,6 +74,7 @@ struct submitter { struct io_uring_sqe *sqes; struct io_cq_ring cq_ring; int inflight; + int tid; unsigned long reaps; unsigned long done; unsigned long calls; @@ -74,6 +82,10 @@ struct submitter { __s32 *fds; + unsigned long *clock_batch; + int clock_index; + unsigned long *plat; + struct file files[MAX_FDS]; unsigned nr_files; unsigned cur_file; @@ -95,9 +107,202 @@ static int sq_thread_poll = 0; /* use kernel submission/poller thread */ static int sq_thread_cpu = -1; /* pin above thread to this CPU */ static int do_nop = 0; /* no-op SQ ring commands */ static int nthreads = 1; +static int stats = 0; /* generate IO stats */ +static unsigned long tsc_rate; static int vectored = 1; +static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, + 80.0, 90.0, 95.0, 99.9, 99.5, 99.9, 99.95, 99.99 }; +static int plist_len = 17; + +static unsigned long cycles_to_nsec(unsigned long cycles) +{ + uint64_t val; + + if (!tsc_rate) + return cycles; + + val = cycles * 1000000000ULL; + return val / tsc_rate; +} + +static unsigned long plat_idx_to_val(unsigned int idx) +{ + unsigned int error_bits; + unsigned long k, base; + + assert(idx < PLAT_NR); + + /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index */ + if (idx < (PLAT_VAL << 1)) + return cycles_to_nsec(idx); + + /* Find the group and compute the minimum value of that group */ + error_bits = (idx >> PLAT_BITS) - 1; + base = ((unsigned long) 1) << (error_bits + PLAT_BITS); + + /* Find its bucket number of the group */ + k = idx % PLAT_VAL; + + /* Return the mean of the range of the bucket */ + return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits))); +} + +unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr, + unsigned long **output, + unsigned long *maxv, unsigned long *minv) +{ + unsigned long sum = 0; + unsigned int len = plist_len, i, j = 0; + unsigned long *ovals = NULL; + bool is_last; + + *minv = -1ULL; + *maxv = 0; + + ovals = malloc(len * sizeof(*ovals)); + if (!ovals) + return 0; + + /* + * Calculate bucket values, note down max and min values + */ + is_last = false; + for (i = 0; i < PLAT_NR && !is_last; i++) { + sum += io_u_plat[i]; + while (sum >= ((long double) plist[j] / 100.0 * nr)) { + assert(plist[j] <= 100.0); + + ovals[j] = plat_idx_to_val(i); + if (ovals[j] < *minv) + *minv = ovals[j]; + if (ovals[j] > *maxv) + *maxv = ovals[j]; + + is_last = (j == len - 1) != 0; + if (is_last) + break; + + j++; + } + } + + if (!is_last) + fprintf(stderr, "error calculating latency percentiles\n"); + + *output = ovals; + return len; +} + +static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr, + unsigned int precision) +{ + unsigned int divisor, len, i, j = 0; + unsigned long minv, maxv; + unsigned long *ovals; + int per_line, scale_down, time_width; + bool is_last; + char fmt[32]; + + len = calc_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv); + if (!len || !ovals) + goto out; + + if (!tsc_rate) { + scale_down = 0; + divisor = 1; + printf(" percentiles (tsc ticks):\n |"); + } else if (minv > 2000 && maxv > 99999) { + scale_down = 1; + divisor = 1000; + printf(" percentiles (usec):\n |"); + } else { + scale_down = 0; + divisor = 1; + printf(" percentiles (nsec):\n |"); + } + + time_width = max(5, (int) (log10(maxv / divisor) + 1)); + snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3, + precision, time_width); + /* fmt will be something like " %5.2fth=[%4llu]%c" */ + per_line = (80 - 7) / (precision + 10 + time_width); + + for (j = 0; j < len; j++) { + /* for formatting */ + if (j != 0 && (j % per_line) == 0) + printf(" |"); + + /* end of the list */ + is_last = (j == len - 1) != 0; + + for (i = 0; i < scale_down; i++) + ovals[j] = (ovals[j] + 999) / 1000; + + printf(fmt, plist[j], ovals[j], is_last ? '\n' : ','); + + if (is_last) + break; + + if ((j % per_line) == per_line - 1) /* for formatting */ + printf("\n"); + } + +out: + free(ovals); +} + +static unsigned int plat_val_to_idx(unsigned long val) +{ + unsigned int msb, error_bits, base, offset, idx; + + /* Find MSB starting from bit 0 */ + if (val == 0) + msb = 0; + else + msb = (sizeof(val)*8) - __builtin_clzll(val) - 1; + + /* + * MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index + */ + if (msb <= PLAT_BITS) + return val; + + /* Compute the number of error bits to discard*/ + error_bits = msb - PLAT_BITS; + + /* Compute the number of buckets before the group */ + base = (error_bits + 1) << PLAT_BITS; + + /* + * Discard the error bits and apply the mask to find the + * index for the buckets in the group + */ + offset = (PLAT_VAL - 1) & (val >> error_bits); + + /* Make sure the index does not exceed (array size - 1) */ + idx = (base + offset) < (PLAT_NR - 1) ? + (base + offset) : (PLAT_NR - 1); + + return idx; +} + +static void add_stat(struct submitter *s, int clock_index, int nr) +{ +#ifdef ARCH_HAVE_CPU_CLOCK + unsigned long cycles; + unsigned int pidx; + + cycles = get_cpu_clock(); + cycles -= s->clock_batch[clock_index]; + pidx = plat_val_to_idx(cycles); + s->plat[pidx] += nr; +#endif +} + static int io_uring_register_buffers(struct submitter *s) { if (do_nop) @@ -224,7 +429,9 @@ static void init_io(struct submitter *s, unsigned index) } sqe->ioprio = 0; sqe->off = offset; - sqe->user_data = (unsigned long) f; + sqe->user_data = (unsigned long) f->fileno; + if (stats) + sqe->user_data |= ((unsigned long)s->clock_index << 32); } static int prep_more_ios(struct submitter *s, int max_ios) @@ -287,7 +494,9 @@ static int reap_events(struct submitter *s) break; cqe = &ring->cqes[head & cq_ring_mask]; if (!do_nop) { - f = (struct file *) (uintptr_t) cqe->user_data; + int fileno = cqe->user_data & 0xffffffff; + + f = &s->files[fileno]; f->pending_ios--; if (cqe->res != bs) { printf("io: unexpected ret=%d\n", cqe->res); @@ -296,6 +505,11 @@ static int reap_events(struct submitter *s) return -1; } } + if (stats) { + int clock_index = cqe->user_data >> 32; + + add_stat(s, clock_index, 1); + } reaped++; head++; } while (1); @@ -311,12 +525,28 @@ static void *submitter_fn(void *data) { struct submitter *s = data; struct io_sq_ring *ring = &s->sq_ring; - int ret, prepped; + int i, ret, prepped, nr_batch; - printf("submitter=%d\n", gettid()); + s->tid = gettid(); + printf("submitter=%d\n", s->tid); srand48(pthread_self()); + for (i = 0; i < MAX_FDS; i++) + s->files[i].fileno = i; + + if (stats) { + nr_batch = roundup_pow2(depth / batch_submit); + s->clock_batch = calloc(nr_batch, sizeof(unsigned long)); + s->clock_index = 0; + + s->plat = calloc(PLAT_NR, sizeof(unsigned long)); + } else { + s->clock_batch = NULL; + s->plat = NULL; + nr_batch = 0; + } + prepped = 0; do { int to_wait, to_submit, this_reap, to_prep; @@ -325,6 +555,12 @@ static void *submitter_fn(void *data) if (!prepped && s->inflight < depth) { to_prep = min(depth - s->inflight, batch_submit); prepped = prep_more_ios(s, to_prep); +#ifdef ARCH_HAVE_CPU_CLOCK + if (prepped && stats) { + s->clock_batch[s->clock_index] = get_cpu_clock(); + s->clock_index = (s->clock_index + 1) & (nr_batch - 1); + } +#endif } s->inflight += prepped; submit_more: @@ -555,9 +791,11 @@ static void usage(char *argv, int status) " -F : Register files, default %d\n" " -n : Number of threads, default %d\n" " -O : Use O_DIRECT, default %d\n" - " -N : Perform just no-op requests, default %d\n", + " -N : Perform just no-op requests, default %d" + " -t : Track IO latencies, default %d" + " -T : TSC rate in HZ\n", argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled, - fixedbufs, register_files, nthreads, !buffered, do_nop); + fixedbufs, register_files, nthreads, !buffered, do_nop, stats); exit(status); } @@ -573,16 +811,20 @@ int main(int argc, char *argv[]) if (!do_nop && argc < 2) usage(argv[0], 1); - while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:h?")) != -1) { + while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:h?")) != -1) { switch (opt) { case 'd': depth = atoi(optarg); break; case 's': batch_submit = atoi(optarg); + if (!batch_submit) + batch_submit = 1; break; case 'c': batch_complete = atoi(optarg); + if (!batch_complete) + batch_complete = 1; break; case 'b': bs = atoi(optarg); @@ -609,6 +851,20 @@ int main(int argc, char *argv[]) case 'O': buffered = !atoi(optarg); break; + case 't': +#ifndef ARCH_HAVE_CPU_CLOCK + fprintf(stderr, "Stats not supported on this CPU\n"); + return 1; +#endif + stats = !!atoi(optarg); + break; + case 'T': +#ifndef ARCH_HAVE_CPU_CLOCK + fprintf(stderr, "Stats not supported on this CPU\n"); + return 1; +#endif + tsc_rate = strtoul(optarg, NULL, 10); + break; case 'h': case '?': default: @@ -764,7 +1020,19 @@ int main(int argc, char *argv[]) s = get_submitter(j); pthread_join(s->thread, &ret); close(s->ring_fd); + + if (stats) { + unsigned long nr; + + printf("%d: Latency percentiles:\n", s->tid); + for (i = 0, nr = 0; i < PLAT_NR; i++) + nr += s->plat[i]; + show_clat_percentiles(s->plat, nr, 4); + free(s->clock_batch); + free(s->plat); + } } + free(fdepths); free(submitter); return 0;