#include <stddef.h>
#include <signal.h>
#include <inttypes.h>
+#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "../arch/arch.h"
#include "../lib/types.h"
+#include "../lib/roundup.h"
+#include "../minmax.h"
#include "../os/linux/io_uring.h"
-#define min(a, b) ((a < b) ? (a) : (b))
-
struct io_sq_ring {
unsigned *head;
unsigned *tail;
unsigned pending_ios;
int real_fd;
int fixed_fd;
+ int fileno;
};
+#define PLAT_BITS 6
+#define PLAT_VAL (1 << PLAT_BITS)
+#define PLAT_GROUP_NR 29
+#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
+
struct submitter {
pthread_t thread;
int ring_fd;
struct io_uring_sqe *sqes;
struct io_cq_ring cq_ring;
int inflight;
+ int tid;
unsigned long reaps;
unsigned long done;
unsigned long calls;
__s32 *fds;
+ unsigned long *clock_batch;
+ int clock_index;
+ unsigned long *plat;
+
struct file files[MAX_FDS];
unsigned nr_files;
unsigned cur_file;
static int sq_thread_cpu = -1; /* pin above thread to this CPU */
static int do_nop = 0; /* no-op SQ ring commands */
static int nthreads = 1;
+static int stats = 0; /* generate IO stats */
+static unsigned long tsc_rate;
static int vectored = 1;
+static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
+ 80.0, 90.0, 95.0, 99.9, 99.5, 99.9, 99.95, 99.99 };
+static int plist_len = 17;
+
+static unsigned long cycles_to_nsec(unsigned long cycles)
+{
+ uint64_t val;
+
+ if (!tsc_rate)
+ return cycles;
+
+ val = cycles * 1000000000ULL;
+ return val / tsc_rate;
+}
+
+static unsigned long plat_idx_to_val(unsigned int idx)
+{
+ unsigned int error_bits;
+ unsigned long k, base;
+
+ assert(idx < PLAT_NR);
+
+ /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+ * all bits of the sample as index */
+ if (idx < (PLAT_VAL << 1))
+ return cycles_to_nsec(idx);
+
+ /* Find the group and compute the minimum value of that group */
+ error_bits = (idx >> PLAT_BITS) - 1;
+ base = ((unsigned long) 1) << (error_bits + PLAT_BITS);
+
+ /* Find its bucket number of the group */
+ k = idx % PLAT_VAL;
+
+ /* Return the mean of the range of the bucket */
+ return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits)));
+}
+
+unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
+ unsigned long **output,
+ unsigned long *maxv, unsigned long *minv)
+{
+ unsigned long sum = 0;
+ unsigned int len = plist_len, i, j = 0;
+ unsigned long *ovals = NULL;
+ bool is_last;
+
+ *minv = -1ULL;
+ *maxv = 0;
+
+ ovals = malloc(len * sizeof(*ovals));
+ if (!ovals)
+ return 0;
+
+ /*
+ * Calculate bucket values, note down max and min values
+ */
+ is_last = false;
+ for (i = 0; i < PLAT_NR && !is_last; i++) {
+ sum += io_u_plat[i];
+ while (sum >= ((long double) plist[j] / 100.0 * nr)) {
+ assert(plist[j] <= 100.0);
+
+ ovals[j] = plat_idx_to_val(i);
+ if (ovals[j] < *minv)
+ *minv = ovals[j];
+ if (ovals[j] > *maxv)
+ *maxv = ovals[j];
+
+ is_last = (j == len - 1) != 0;
+ if (is_last)
+ break;
+
+ j++;
+ }
+ }
+
+ if (!is_last)
+ fprintf(stderr, "error calculating latency percentiles\n");
+
+ *output = ovals;
+ return len;
+}
+
+static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
+ unsigned int precision)
+{
+ unsigned int divisor, len, i, j = 0;
+ unsigned long minv, maxv;
+ unsigned long *ovals;
+ int per_line, scale_down, time_width;
+ bool is_last;
+ char fmt[32];
+
+ len = calc_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv);
+ if (!len || !ovals)
+ goto out;
+
+ if (!tsc_rate) {
+ scale_down = 0;
+ divisor = 1;
+ printf(" percentiles (tsc ticks):\n |");
+ } else if (minv > 2000 && maxv > 99999) {
+ scale_down = 1;
+ divisor = 1000;
+ printf(" percentiles (usec):\n |");
+ } else {
+ scale_down = 0;
+ divisor = 1;
+ printf(" percentiles (nsec):\n |");
+ }
+
+ time_width = max(5, (int) (log10(maxv / divisor) + 1));
+ snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
+ precision, time_width);
+ /* fmt will be something like " %5.2fth=[%4llu]%c" */
+ per_line = (80 - 7) / (precision + 10 + time_width);
+
+ for (j = 0; j < len; j++) {
+ /* for formatting */
+ if (j != 0 && (j % per_line) == 0)
+ printf(" |");
+
+ /* end of the list */
+ is_last = (j == len - 1) != 0;
+
+ for (i = 0; i < scale_down; i++)
+ ovals[j] = (ovals[j] + 999) / 1000;
+
+ printf(fmt, plist[j], ovals[j], is_last ? '\n' : ',');
+
+ if (is_last)
+ break;
+
+ if ((j % per_line) == per_line - 1) /* for formatting */
+ printf("\n");
+ }
+
+out:
+ free(ovals);
+}
+
+static unsigned int plat_val_to_idx(unsigned long val)
+{
+ unsigned int msb, error_bits, base, offset, idx;
+
+ /* Find MSB starting from bit 0 */
+ if (val == 0)
+ msb = 0;
+ else
+ msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;
+
+ /*
+ * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+ * all bits of the sample as index
+ */
+ if (msb <= PLAT_BITS)
+ return val;
+
+ /* Compute the number of error bits to discard*/
+ error_bits = msb - PLAT_BITS;
+
+ /* Compute the number of buckets before the group */
+ base = (error_bits + 1) << PLAT_BITS;
+
+ /*
+ * Discard the error bits and apply the mask to find the
+ * index for the buckets in the group
+ */
+ offset = (PLAT_VAL - 1) & (val >> error_bits);
+
+ /* Make sure the index does not exceed (array size - 1) */
+ idx = (base + offset) < (PLAT_NR - 1) ?
+ (base + offset) : (PLAT_NR - 1);
+
+ return idx;
+}
+
+static void add_stat(struct submitter *s, int clock_index, int nr)
+{
+#ifdef ARCH_HAVE_CPU_CLOCK
+ unsigned long cycles;
+ unsigned int pidx;
+
+ cycles = get_cpu_clock();
+ cycles -= s->clock_batch[clock_index];
+ pidx = plat_val_to_idx(cycles);
+ s->plat[pidx] += nr;
+#endif
+}
+
static int io_uring_register_buffers(struct submitter *s)
{
if (do_nop)
}
sqe->ioprio = 0;
sqe->off = offset;
- sqe->user_data = (unsigned long) f;
+ sqe->user_data = (unsigned long) f->fileno;
+ if (stats)
+ sqe->user_data |= ((unsigned long)s->clock_index << 32);
}
static int prep_more_ios(struct submitter *s, int max_ios)
break;
cqe = &ring->cqes[head & cq_ring_mask];
if (!do_nop) {
- f = (struct file *) (uintptr_t) cqe->user_data;
+ int fileno = cqe->user_data & 0xffffffff;
+
+ f = &s->files[fileno];
f->pending_ios--;
if (cqe->res != bs) {
printf("io: unexpected ret=%d\n", cqe->res);
return -1;
}
}
+ if (stats) {
+ int clock_index = cqe->user_data >> 32;
+
+ add_stat(s, clock_index, 1);
+ }
reaped++;
head++;
} while (1);
{
struct submitter *s = data;
struct io_sq_ring *ring = &s->sq_ring;
- int ret, prepped;
+ int i, ret, prepped, nr_batch;
- printf("submitter=%d\n", gettid());
+ s->tid = gettid();
+ printf("submitter=%d\n", s->tid);
srand48(pthread_self());
+ for (i = 0; i < MAX_FDS; i++)
+ s->files[i].fileno = i;
+
+ if (stats) {
+ nr_batch = roundup_pow2(depth / batch_submit);
+ s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
+ s->clock_index = 0;
+
+ s->plat = calloc(PLAT_NR, sizeof(unsigned long));
+ } else {
+ s->clock_batch = NULL;
+ s->plat = NULL;
+ nr_batch = 0;
+ }
+
prepped = 0;
do {
int to_wait, to_submit, this_reap, to_prep;
if (!prepped && s->inflight < depth) {
to_prep = min(depth - s->inflight, batch_submit);
prepped = prep_more_ios(s, to_prep);
+#ifdef ARCH_HAVE_CPU_CLOCK
+ if (prepped && stats) {
+ s->clock_batch[s->clock_index] = get_cpu_clock();
+ s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+ }
+#endif
}
s->inflight += prepped;
submit_more:
" -F <bool> : Register files, default %d\n"
" -n <int> : Number of threads, default %d\n"
" -O <bool> : Use O_DIRECT, default %d\n"
- " -N <bool> : Perform just no-op requests, default %d\n",
+ " -N <bool> : Perform just no-op requests, default %d"
+ " -t <bool> : Track IO latencies, default %d"
+ " -T <int> : TSC rate in HZ\n",
argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
- fixedbufs, register_files, nthreads, !buffered, do_nop);
+ fixedbufs, register_files, nthreads, !buffered, do_nop, stats);
exit(status);
}
if (!do_nop && argc < 2)
usage(argv[0], 1);
- while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:h?")) != -1) {
+ while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:h?")) != -1) {
switch (opt) {
case 'd':
depth = atoi(optarg);
break;
case 's':
batch_submit = atoi(optarg);
+ if (!batch_submit)
+ batch_submit = 1;
break;
case 'c':
batch_complete = atoi(optarg);
+ if (!batch_complete)
+ batch_complete = 1;
break;
case 'b':
bs = atoi(optarg);
case 'O':
buffered = !atoi(optarg);
break;
+ case 't':
+#ifndef ARCH_HAVE_CPU_CLOCK
+ fprintf(stderr, "Stats not supported on this CPU\n");
+ return 1;
+#endif
+ stats = !!atoi(optarg);
+ break;
+ case 'T':
+#ifndef ARCH_HAVE_CPU_CLOCK
+ fprintf(stderr, "Stats not supported on this CPU\n");
+ return 1;
+#endif
+ tsc_rate = strtoul(optarg, NULL, 10);
+ break;
case 'h':
case '?':
default:
s = get_submitter(j);
pthread_join(s->thread, &ret);
close(s->ring_fd);
+
+ if (stats) {
+ unsigned long nr;
+
+ printf("%d: Latency percentiles:\n", s->tid);
+ for (i = 0, nr = 0; i < PLAT_NR; i++)
+ nr += s->plat[i];
+ show_clat_percentiles(s->plat, nr, 4);
+ free(s->clock_batch);
+ free(s->plat);
+ }
}
+
free(fdepths);
free(submitter);
return 0;