t/io_uring: don't append 'K' to IOPS if we don't divide by 1000
[fio.git] / t / io_uring.c
index e5568aa2e5a8f8420e714bfb0752c04feb762497..1b729ebf95443f0922a327165420f10bb5c93515 100644 (file)
@@ -7,6 +7,10 @@
 #include <inttypes.h>
 #include <math.h>
 
+#ifdef CONFIG_LIBAIO
+#include <libaio.h>
+#endif
+
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
@@ -86,6 +90,10 @@ struct submitter {
        int clock_index;
        unsigned long *plat;
 
+#ifdef CONFIG_LIBAIO
+       io_context_t aio_ctx;
+#endif
+
        struct file files[MAX_FDS];
        unsigned nr_files;
        unsigned cur_file;
@@ -94,6 +102,7 @@ struct submitter {
 
 static struct submitter *submitter;
 static volatile int finish;
+static int stats_running;
 
 static int depth = DEPTH;
 static int batch_submit = BATCH_SUBMIT;
@@ -101,6 +110,7 @@ static int batch_complete = BATCH_COMPLETE;
 static int bs = BS;
 static int polled = 1;         /* use IO polling */
 static int fixedbufs = 1;      /* use fixed user buffers */
+static int dma_map;            /* pre-map DMA buffers */
 static int register_files = 1; /* use fixed files */
 static int buffered = 0;       /* use buffered IO, not O_DIRECT */
 static int sq_thread_poll = 0; /* use kernel submission/poller thread */
@@ -108,14 +118,30 @@ static int sq_thread_cpu = -1;    /* pin above thread to this CPU */
 static int do_nop = 0;         /* no-op SQ ring commands */
 static int nthreads = 1;
 static int stats = 0;          /* generate IO stats */
+static int aio = 0;            /* use libaio */
+static int runtime = 0;        /* runtime */
+
 static unsigned long tsc_rate;
 
+#define TSC_RATE_FILE  "tsc-rate"
+
 static int vectored = 1;
 
 static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
-                       80.0, 90.0, 95.0, 99.9, 99.5, 99.9, 99.95, 99.99 };
+                       80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
 static int plist_len = 17;
 
+/*
+ * Fallback used only when IORING_REGISTER_MAP_BUFFERS is not already
+ * defined: provides the register opcode value and the argument structure
+ * that io_uring_map_buffers() hands to the io_uring_register syscall.
+ */
+#ifndef IORING_REGISTER_MAP_BUFFERS
+#define IORING_REGISTER_MAP_BUFFERS    20
+struct io_uring_map_buffers {
+       /* the caller in this file stores the first file's real_fd here */
+       __s32   fd;
+       /* left at zero by the caller in this file */
+       __u32   buf_start;
+       /* the caller in this file stores the queue depth here */
+       __u32   buf_end;
+       __u32   flags;
+       /* NOTE(review): presumably reserved and expected to be zero - confirm */
+       __u64   rsvd[2];
+};
+#endif
+
 static unsigned long cycles_to_nsec(unsigned long cycles)
 {
        uint64_t val;
@@ -296,13 +322,33 @@ static void add_stat(struct submitter *s, int clock_index, int nr)
        unsigned long cycles;
        unsigned int pidx;
 
-       cycles = get_cpu_clock();
-       cycles -= s->clock_batch[clock_index];
-       pidx = plat_val_to_idx(cycles);
-       s->plat[pidx] += nr;
+       if (!s->finish && clock_index) {
+               cycles = get_cpu_clock();
+               cycles -= s->clock_batch[clock_index];
+               pidx = plat_val_to_idx(cycles);
+               s->plat[pidx] += nr;
+       }
 #endif
 }
 
+/*
+ * Issue IORING_REGISTER_MAP_BUFFERS on the submitter's ring, naming the
+ * submitter's first file and using the queue depth as buf_end.
+ *
+ * Returns 0 without doing anything in no-op mode, -1 if the submitter has
+ * more than one file, and otherwise the result of the register syscall.
+ */
+static int io_uring_map_buffers(struct submitter *s)
+{
+       struct io_uring_map_buffers args = { 0 };
+       int ret;
+
+       if (do_nop)
+               return 0;
+
+       if (s->nr_files > 1) {
+               fprintf(stderr, "Can't map buffers with multiple files\n");
+               return -1;
+       }
+
+       args.fd = s->files[0].real_fd;
+       args.buf_end = depth;
+
+       ret = syscall(__NR_io_uring_register, s->ring_fd,
+                       IORING_REGISTER_MAP_BUFFERS, &args, 1);
+       return ret;
+}
+
 static int io_uring_register_buffers(struct submitter *s)
 {
        if (do_nop)
@@ -430,11 +476,11 @@ static void init_io(struct submitter *s, unsigned index)
        sqe->ioprio = 0;
        sqe->off = offset;
        sqe->user_data = (unsigned long) f->fileno;
-       if (stats)
+       if (stats && stats_running)
                sqe->user_data |= ((unsigned long)s->clock_index << 32);
 }
 
-static int prep_more_ios(struct submitter *s, int max_ios)
+static int prep_more_ios_uring(struct submitter *s, int max_ios)
 {
        struct io_sq_ring *ring = &s->sq_ring;
        unsigned index, tail, next_tail, prepped = 0;
@@ -479,7 +525,7 @@ static int get_file_size(struct file *f)
        return -1;
 }
 
-static int reap_events(struct submitter *s)
+static int reap_events_uring(struct submitter *s)
 {
        struct io_cq_ring *ring = &s->cq_ring;
        struct io_uring_cqe *cqe;
@@ -517,7 +563,6 @@ static int reap_events(struct submitter *s)
                                last_idx = clock_index;
                        }
                        stat_nr++;
-                       add_stat(s, clock_index, 1);
                }
                reaped++;
                head++;
@@ -533,14 +578,12 @@ static int reap_events(struct submitter *s)
        return reaped;
 }
 
-static void *submitter_fn(void *data)
+static int submitter_init(struct submitter *s)
 {
-       struct submitter *s = data;
-       struct io_sq_ring *ring = &s->sq_ring;
-       int i, ret, prepped, nr_batch;
+       int i, nr_batch;
 
        s->tid = gettid();
-       printf("submitter=%d\n", s->tid);
+       printf("submitter=%d, tid=%d\n", s->index, s->tid);
 
        srand48(pthread_self());
 
@@ -549,8 +592,10 @@ static void *submitter_fn(void *data)
 
        if (stats) {
                nr_batch = roundup_pow2(depth / batch_submit);
+               if (nr_batch < 2)
+                       nr_batch = 2;
                s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
-               s->clock_index = 0;
+               s->clock_index = 1;
 
                s->plat = calloc(PLAT_NR, sizeof(unsigned long));
        } else {
@@ -559,6 +604,170 @@ static void *submitter_fn(void *data)
                nr_batch = 0;
        }
 
+       return nr_batch;
+}
+
+#ifdef CONFIG_LIBAIO
+/*
+ * Prepare up to max_ios pread requests in iocbs[0..max_ios).
+ *
+ * Each request targets a randomly chosen block of the current file and
+ * uses the submitter's iovec with the same index as the iocb. The file
+ * number is stored in the low 32 bits of iocb->data; when stats are being
+ * collected the current clock index goes in the upper 32 bits.
+ *
+ * Returns the number of requests prepared.
+ */
+static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
+{
+       unsigned nr;
+
+       for (nr = 0; nr < max_ios; nr++) {
+               struct file *f = &s->files[0];
+               unsigned long tag, off;
+
+               if (s->nr_files != 1) {
+                       f = &s->files[s->cur_file];
+                       /* current file is saturated, move on to the next one */
+                       if (f->pending_ios >= file_depth(s)) {
+                               if (++s->cur_file == s->nr_files)
+                                       s->cur_file = 0;
+                               f = &s->files[s->cur_file];
+                       }
+               }
+               f->pending_ios++;
+
+               off = (lrand48() % (f->max_blocks - 1)) * bs;
+               io_prep_pread(&iocbs[nr], f->real_fd, s->iovecs[nr].iov_base,
+                               s->iovecs[nr].iov_len, off);
+
+               tag = f->fileno;
+               if (stats && stats_running)
+                       tag |= ((unsigned long) s->clock_index << 32);
+               iocbs[nr].data = (void *) (uintptr_t) tag;
+       }
+
+       return nr;
+}
+
+/*
+ * Account for evs completed aio events.
+ *
+ * For every event the owning file (low 32 bits of the event data) gets its
+ * pending count decremented. A result other than the block size aborts the
+ * walk and returns -1. With stats enabled, consecutive events sharing a
+ * clock index (upper 32 bits of the event data) are handed to add_stat()
+ * as one batch.
+ *
+ * On success the submitter's inflight/done counters are updated and the
+ * number of events processed is returned.
+ */
+static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
+{
+       int prev_idx = -1, batched = 0;
+       int nr = 0;
+
+       for (; evs; evs--, nr++) {
+               struct io_event *ev = &events[nr];
+               unsigned long tag = (uintptr_t) ev->data;
+               struct file *f = &s->files[tag & 0xffffffff];
+               int cur_idx;
+
+               f->pending_ios--;
+               if (ev->res != bs) {
+                       printf("io: unexpected ret=%ld\n", ev->res);
+                       return -1;
+               }
+               if (!stats)
+                       continue;
+
+               cur_idx = tag >> 32;
+               if (cur_idx != prev_idx) {
+                       /* clock index changed: flush the batch gathered so far */
+                       if (prev_idx != -1) {
+                               add_stat(s, prev_idx, batched);
+                               batched = 0;
+                       }
+                       prev_idx = cur_idx;
+               }
+               batched++;
+       }
+
+       if (batched)
+               add_stat(s, prev_idx, batched);
+
+       s->inflight -= nr;
+       s->done += nr;
+       return nr;
+}
+
+/*
+ * Thread body for the libaio engine.
+ *
+ * data is the struct submitter this thread drives. The loop prepares up to
+ * batch_submit requests while fewer than depth are in flight, submits them
+ * with io_submit(), then waits for and reaps completions, until s->finish
+ * is set or a submit error occurs. A failed allocation or a completion
+ * with an unexpected result also ends the thread.
+ *
+ * Sets the global finish flag on exit and always returns NULL.
+ */
+static void *submitter_aio_fn(void *data)
+{
+       struct submitter *s = data;
+       int i, ret, prepped, nr_batch;
+       struct iocb **iocbsptr;
+       struct iocb *iocbs;
+       struct io_event *events;
+
+       nr_batch = submitter_init(s);
+
+       iocbsptr = calloc(depth, sizeof(struct iocb *));
+       iocbs = calloc(depth, sizeof(struct iocb));
+       events = calloc(depth, sizeof(struct io_event));
+       if (!iocbsptr || !iocbs || !events) {
+               fprintf(stderr, "aio: failed to allocate request arrays\n");
+               goto out;
+       }
+
+       /* io_submit() takes an array of pointers to the iocbs */
+       for (i = 0; i < depth; i++)
+               iocbsptr[i] = &iocbs[i];
+
+       prepped = 0;
+       do {
+               int to_wait, to_submit, to_prep;
+
+               if (!prepped && s->inflight < depth) {
+                       to_prep = min(depth - s->inflight, batch_submit);
+                       prepped = prep_more_ios_aio(s, to_prep, iocbs);
+#ifdef ARCH_HAVE_CPU_CLOCK
+                       if (prepped && stats) {
+                               /* stamp this batch, then advance the batch slot */
+                               s->clock_batch[s->clock_index] = get_cpu_clock();
+                               s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+                       }
+#endif
+               }
+               s->inflight += prepped;
+               to_submit = prepped;
+
+               /* skip waiting while there is still room to queue more */
+               if (to_submit && (s->inflight + to_submit <= depth))
+                       to_wait = 0;
+               else
+                       to_wait = min(s->inflight + to_submit, batch_complete);
+
+               ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
+               s->calls++;
+               if (ret < 0) {
+                       perror("io_submit");
+                       break;
+               } else if (ret != to_submit) {
+                       printf("submitted %d, wanted %d\n", ret, to_submit);
+                       break;
+               }
+               prepped = 0;
+
+               while (to_wait) {
+                       int r;
+
+                       s->calls++;
+                       r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL);
+                       if (r < 0) {
+                               perror("io_getevents");
+                               break;
+                       } else if (r != to_wait) {
+                               printf("r=%d, wait=%d\n", r, to_wait);
+                               break;
+                       }
+                       r = reap_events_aio(s, events, r);
+                       if (r < 0) {
+                               /*
+                                * Bad completion: stop this submitter, as the
+                                * io_uring path does, instead of folding -1
+                                * into the reap and wait counters.
+                                */
+                               s->finish = 1;
+                               break;
+                       }
+                       s->reaps += r;
+                       to_wait -= r;
+               }
+       } while (!s->finish);
+
+out:
+       free(iocbsptr);
+       free(iocbs);
+       free(events);
+       finish = 1;
+       return NULL;
+}
+#endif
+
+static void *submitter_uring_fn(void *data)
+{
+       struct submitter *s = data;
+       struct io_sq_ring *ring = &s->sq_ring;
+       int ret, prepped, nr_batch;
+
+       nr_batch = submitter_init(s);
+
        prepped = 0;
        do {
                int to_wait, to_submit, this_reap, to_prep;
@@ -566,7 +775,7 @@ static void *submitter_fn(void *data)
 
                if (!prepped && s->inflight < depth) {
                        to_prep = min(depth - s->inflight, batch_submit);
-                       prepped = prep_more_ios(s, to_prep);
+                       prepped = prep_more_ios_uring(s, to_prep);
 #ifdef ARCH_HAVE_CPU_CLOCK
                        if (prepped && stats) {
                                s->clock_batch[s->clock_index] = get_cpu_clock();
@@ -611,7 +820,8 @@ submit:
                this_reap = 0;
                do {
                        int r;
-                       r = reap_events(s);
+
+                       r = reap_events_uring(s);
                        if (r == -1) {
                                s->finish = 1;
                                break;
@@ -664,11 +874,10 @@ static struct submitter *get_submitter(int offset)
        return ret;
 }
 
-static void sig_int(int sig)
+static void do_finish(const char *reason)
 {
        int j;
-
-       printf("Exiting on signal %d\n", sig);
+       printf("Exiting on %s\n", reason);
        for (j = 0; j < nthreads; j++) {
                struct submitter *s = get_submitter(j);
                s->finish = 1;
@@ -676,6 +885,11 @@ static void sig_int(int sig)
        finish = 1;
 }
 
+/*
+ * Signal handler: shut the run down through do_finish(). The signal number
+ * itself is not used.
+ */
+static void sig_int(int sig)
+{
+       (void) sig;
+
+       do_finish("signal");
+}
+
 static void arm_sig_int(void)
 {
        struct sigaction act;
@@ -691,6 +905,34 @@ static void arm_sig_int(void)
 #endif
 }
 
+/*
+ * Prepare a submitter for the libaio engine.
+ *
+ * Options that only apply to io_uring (polled IO, SQPOLL, no-op requests,
+ * registered files/buffers) are switched off with a note on stderr, then
+ * an aio context sized for the queue depth is created in s->aio_ctx.
+ *
+ * Returns the io_queue_init() result. Without CONFIG_LIBAIO it sets errno
+ * to EINVAL and returns -1.
+ */
+static int setup_aio(struct submitter *s)
+{
+#ifdef CONFIG_LIBAIO
+       if (polled) {
+               fprintf(stderr, "aio does not support polled IO\n");
+               polled = 0;
+       }
+       if (sq_thread_poll) {
+               fprintf(stderr, "aio does not support SQPOLL IO\n");
+               sq_thread_poll = 0;
+       }
+       if (do_nop) {
+               /* was a copy of the polled IO message */
+               fprintf(stderr, "aio does not support no-op IO\n");
+               do_nop = 0;
+       }
+       if (fixedbufs || register_files) {
+               fprintf(stderr, "aio does not support registered files or buffers\n");
+               fixedbufs = register_files = 0;
+       }
+
+       return io_queue_init(depth, &s->aio_ctx);
+#else
+       fprintf(stderr, "Legacy AIO not available on this system/build\n");
+       errno = EINVAL;
+       return -1;
+#endif
+}
+
 static int setup_ring(struct submitter *s)
 {
        struct io_sq_ring *sring = &s->sq_ring;
@@ -733,6 +975,14 @@ static int setup_ring(struct submitter *s)
                        perror("io_uring_register_buffers");
                        return 1;
                }
+
+               if (dma_map) {
+                       ret = io_uring_map_buffers(s);
+                       if (ret < 0) {
+                               perror("io_uring_map_buffers");
+                               return 1;
+                       }
+               }
        }
 
        if (register_files) {
@@ -746,7 +996,6 @@ static int setup_ring(struct submitter *s)
        ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
                        IORING_OFF_SQ_RING);
-       printf("sq_ring ptr = 0x%p\n", ptr);
        sring->head = ptr + p.sq_off.head;
        sring->tail = ptr + p.sq_off.tail;
        sring->ring_mask = ptr + p.sq_off.ring_mask;
@@ -758,12 +1007,10 @@ static int setup_ring(struct submitter *s)
        s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
                        IORING_OFF_SQES);
-       printf("sqes ptr    = 0x%p\n", s->sqes);
 
        ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
                        IORING_OFF_CQ_RING);
-       printf("cq_ring ptr = 0x%p\n", ptr);
        cring->head = ptr + p.cq_off.head;
        cring->tail = ptr + p.cq_off.tail;
        cring->ring_mask = ptr + p.cq_off.ring_mask;
@@ -798,6 +1045,8 @@ static void file_depths(char *buf)
 
 static void usage(char *argv, int status)
 {
+       char runtime_str[16];
+       snprintf(runtime_str, sizeof(runtime_str), "%d", runtime);
        printf("%s [options] -- [filenames]\n"
                " -d <int>  : IO Depth, default %d\n"
                " -s <int>  : Batch submit, default %d\n"
@@ -805,17 +1054,65 @@ static void usage(char *argv, int status)
                " -b <int>  : Block size, default %d\n"
                " -p <bool> : Polled IO, default %d\n"
                " -B <bool> : Fixed buffers, default %d\n"
+               " -R <bool> : DMA map fixed buffers, default %d\n"
                " -F <bool> : Register files, default %d\n"
                " -n <int>  : Number of threads, default %d\n"
                " -O <bool> : Use O_DIRECT, default %d\n"
                " -N <bool> : Perform just no-op requests, default %d\n"
                " -t <bool> : Track IO latencies, default %d\n"
-               " -T <int>  : TSC rate in HZ\n",
+               " -T <int>  : TSC rate in HZ\n"
+               " -a <bool> : Use legacy aio, default %d\n"
+               " -r <int>  : Runtime in seconds, default %s\n",
                argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
-               fixedbufs, register_files, nthreads, !buffered, do_nop, stats);
+               fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
+               stats, aio, runtime == 0 ? "unlimited" : runtime_str);
        exit(status);
 }
 
+/*
+ * Load the TSC rate from TSC_RATE_FILE unless a rate is already set.
+ *
+ * A missing, unreadable or empty file is silently ignored. Otherwise the
+ * contents are parsed as a decimal number into tsc_rate and the value is
+ * printed.
+ */
+static void read_tsc_rate(void)
+{
+       char buffer[32];
+       int fd, ret;
+
+       if (tsc_rate)
+               return;
+
+       fd = open(TSC_RATE_FILE, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       /* keep one byte free: read() does not terminate the string */
+       ret = read(fd, buffer, sizeof(buffer) - 1);
+       close(fd);
+       if (ret <= 0)
+               return;
+       buffer[ret] = '\0';
+
+       tsc_rate = strtoul(buffer, NULL, 10);
+       printf("Using TSC rate %luHz\n", tsc_rate);
+}
+
+/*
+ * Save the current tsc_rate as decimal text in TSC_RATE_FILE.
+ *
+ * Nothing is written if the file already exists or cannot be created. A
+ * failed write is reported with perror().
+ */
+static void write_tsc_rate(void)
+{
+       char text[32];
+       struct stat st;
+       int fd, len;
+
+       /* an existing file is left untouched */
+       if (stat(TSC_RATE_FILE, &st) == 0)
+               return;
+
+       fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644);
+       if (fd < 0)
+               return;
+
+       len = snprintf(text, sizeof(text), "%lu", tsc_rate);
+       if (write(fd, text, len) < 0)
+               perror("write");
+
+       close(fd);
+}
+
 int main(int argc, char *argv[])
 {
        struct submitter *s;
@@ -828,8 +1125,11 @@ int main(int argc, char *argv[])
        if (!do_nop && argc < 2)
                usage(argv[0], 1);
 
-       while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:h?")) != -1) {
+       while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:h?")) != -1) {
                switch (opt) {
+               case 'a':
+                       aio = !!atoi(optarg);
+                       break;
                case 'd':
                        depth = atoi(optarg);
                        break;
@@ -881,6 +1181,13 @@ int main(int argc, char *argv[])
                        return 1;
 #endif
                        tsc_rate = strtoul(optarg, NULL, 10);
+                       write_tsc_rate();
+                       break;
+               case 'r':
+                       runtime = atoi(optarg);
+                       break;
+               case 'D':
+                       dma_map = !!atoi(optarg);
                        break;
                case 'h':
                case '?':
@@ -890,10 +1197,15 @@ int main(int argc, char *argv[])
                }
        }
 
+       if (stats)
+               read_tsc_rate();
+
        if (batch_complete > depth)
                batch_complete = depth;
        if (batch_submit > depth)
                batch_submit = depth;
+       if (!fixedbufs && dma_map)
+               dma_map = 0;
 
        submitter = calloc(nthreads, sizeof(*submitter) +
                                depth * sizeof(struct iovec));
@@ -983,19 +1295,30 @@ int main(int argc, char *argv[])
        for (j = 0; j < nthreads; j++) {
                s = get_submitter(j);
 
-               err = setup_ring(s);
+               if (!aio)
+                       err = setup_ring(s);
+               else
+                       err = setup_aio(s);
                if (err) {
                        printf("ring setup failed: %s, %d\n", strerror(errno), err);
                        return 1;
                }
        }
        s = get_submitter(0);
-       printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d", polled, fixedbufs, register_files, buffered);
-       printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", depth, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+       printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
+       if (!aio)
+               printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+       else
+               printf("Engine=aio\n");
 
        for (j = 0; j < nthreads; j++) {
                s = get_submitter(j);
-               pthread_create(&s->thread, NULL, submitter_fn, s);
+               if (!aio)
+                       pthread_create(&s->thread, NULL, submitter_uring_fn, s);
+#ifdef CONFIG_LIBAIO
+               else
+                       pthread_create(&s->thread, NULL, submitter_aio_fn, s);
+#endif
        }
 
        fdepths = malloc(8 * s->nr_files * nthreads);
@@ -1008,6 +1331,17 @@ int main(int argc, char *argv[])
                unsigned long iops, bw;
 
                sleep(1);
+               if (runtime && !--runtime)
+                       do_finish("timeout");
+
+               /* don't print partial run, if interrupted by signal */
+               if (finish)
+                       break;
+
+               /* one second in to the run, enable stats */
+               if (stats)
+                       stats_running = 1;
+
                for (j = 0; j < nthreads; j++) {
                        this_done += s->done;
                        this_call += s->calls;
@@ -1024,7 +1358,10 @@ int main(int argc, char *argv[])
                        bw = iops * (bs / 1048576);
                else
                        bw = iops / (1048576 / bs);
-               printf("IOPS=%lu, ", iops);
+               if (iops > 100000)
+                       printf("IOPS=%luK, ", iops / 1000);
+               else
+                       printf("IOPS=%lu, ", iops);
                if (!do_nop)
                        printf("BW=%luMiB/s, ", bw);
                printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths);