t/io_uring: use internal random generator
[fio.git] / t / io_uring.c
1 #include <stdio.h>
2 #include <errno.h>
3 #include <assert.h>
4 #include <stdlib.h>
5 #include <stddef.h>
6 #include <signal.h>
7 #include <inttypes.h>
8 #include <math.h>
9
10 #ifdef CONFIG_LIBAIO
11 #include <libaio.h>
12 #endif
13
14 #include <sys/types.h>
15 #include <sys/stat.h>
16 #include <sys/ioctl.h>
17 #include <sys/syscall.h>
18 #include <sys/resource.h>
19 #include <sys/mman.h>
20 #include <sys/uio.h>
21 #include <linux/fs.h>
22 #include <fcntl.h>
23 #include <unistd.h>
24 #include <string.h>
25 #include <pthread.h>
26 #include <sched.h>
27
28 #include "../arch/arch.h"
29 #include "../lib/types.h"
30 #include "../lib/roundup.h"
31 #include "../lib/rand.h"
32 #include "../minmax.h"
33 #include "../os/linux/io_uring.h"
34
/*
 * Mapped pointers into the kernel's submission queue (SQ) ring; all
 * fields point into the ring region shared with the kernel.
 */
struct io_sq_ring {
	unsigned *head;		/* consumer index (read with acquire) */
	unsigned *tail;		/* producer index, advanced by us on submit */
	unsigned *ring_mask;	/* entries - 1, for wrapping indices */
	unsigned *ring_entries;	/* number of ring entries */
	unsigned *flags;	/* kernel flags, e.g. IORING_SQ_NEED_WAKEUP */
	unsigned *array;	/* indirection array of sqe indices */
};
43
/*
 * Mapped pointers into the kernel's completion queue (CQ) ring.
 */
struct io_cq_ring {
	unsigned *head;		/* consumer index, advanced by us after reaping */
	unsigned *tail;		/* producer index (read with acquire) */
	unsigned *ring_mask;	/* entries - 1, for wrapping indices */
	unsigned *ring_entries;	/* number of ring entries */
	struct io_uring_cqe *cqes;	/* completion entry array */
};
51
/* Defaults; the matching runtime variables below can override these */
#define DEPTH			128	/* total queue depth */
#define BATCH_SUBMIT		32	/* sqes prepped/submitted per call */
#define BATCH_COMPLETE		32	/* completions waited for per call */
#define BS			4096	/* IO block size in bytes */

#define MAX_FDS			16	/* max target files per submitter */

/* cached ring masks (entries - 1), filled in when the rings are set up */
static unsigned sq_ring_mask, cq_ring_mask;
60
/* Per-target-file state */
struct file {
	unsigned long max_blocks;	/* file size in units of the block size */
	unsigned pending_ios;		/* IOs currently in flight on this file */
	int real_fd;			/* fd as returned by open() */
	int fixed_fd;			/* index into the registered file table */
	int fileno;			/* index of this file within its submitter */
};
68
/*
 * Latency histogram geometry: PLAT_GROUP_NR groups of PLAT_VAL buckets
 * each, with the bucket granularity doubling from one group to the next.
 */
#define PLAT_BITS		6
#define PLAT_VAL		(1 << PLAT_BITS)
#define PLAT_GROUP_NR		29
#define PLAT_NR			(PLAT_GROUP_NR * PLAT_VAL)
73
/*
 * Per-thread state: one submitter owns one io_uring (or libaio context)
 * and the set of files it issues IO against.
 */
struct submitter {
	pthread_t thread;		/* worker thread handle */
	int ring_fd;			/* fd from io_uring_setup() */
	int index;			/* submitter number, for logging */
	struct io_sq_ring sq_ring;	/* mapped SQ ring pointers */
	struct io_uring_sqe *sqes;	/* mapped sqe array */
	struct io_cq_ring cq_ring;	/* mapped CQ ring pointers */
	int inflight;			/* IOs submitted but not yet reaped */
	int tid;			/* kernel thread id */
	unsigned long reaps;		/* total completions reaped */
	unsigned long done;		/* total sqes successfully submitted */
	unsigned long calls;		/* enter/submit/getevents call count */
	volatile int finish;		/* set to ask this thread to exit */

	__s32 *fds;			/* fd table for IORING_REGISTER_FILES */

	struct taus258_state rand_state; /* internal random generator state */

	unsigned long *clock_batch;	/* per-batch submit timestamps (stats) */
	int clock_index;		/* next slot in clock_batch; 0 = unused */
	unsigned long *plat;		/* latency histogram buckets (stats) */

#ifdef CONFIG_LIBAIO
	io_context_t aio_ctx;		/* libaio context when aio mode is used */
#endif

	struct file files[MAX_FDS];	/* target files */
	unsigned nr_files;		/* valid entries in files[] */
	unsigned cur_file;		/* round-robin cursor into files[] */
	struct iovec iovecs[];		/* per-depth IO buffers (flexible array) */
};
105
static struct submitter *submitter;	/* contiguous array of nthreads submitters */
static volatile int finish;		/* global "stop the run" flag */
static int stats_running;		/* stats collection currently active */
static unsigned long max_iops;		/* peak IOPS seen, reported at exit */

static int depth = DEPTH;		/* total queue depth per submitter */
static int batch_submit = BATCH_SUBMIT;		/* sqes prepped per submit */
static int batch_complete = BATCH_COMPLETE;	/* completions waited for */
static int bs = BS;			/* IO block size in bytes */
static int polled = 1;		/* use IO polling */
static int fixedbufs = 1;	/* use fixed user buffers */
static int dma_map;		/* pre-map DMA buffers */
static int register_files = 1;	/* use fixed files */
static int buffered = 0;	/* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0;	/* use kernel submission/poller thread */
static int sq_thread_cpu = -1;	/* pin above thread to this CPU */
static int do_nop = 0;		/* no-op SQ ring commands */
static int nthreads = 1;	/* number of submitter threads */
static int stats = 0;		/* generate IO stats */
static int aio = 0;		/* use libaio */
static int runtime = 0;	/* runtime */

static unsigned long tsc_rate;	/* TSC ticks per second; 0 = unknown */

#define TSC_RATE_FILE	"tsc-rate"

static int vectored = 1;	/* use IORING_OP_READV instead of IORING_OP_READ */

/* percentile points reported in the latency summary */
static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
			80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
static int plist_len = 17;
137
/* Fallback definitions when the system io_uring.h predates MAP_BUFFERS */
#ifndef IORING_REGISTER_MAP_BUFFERS
#define IORING_REGISTER_MAP_BUFFERS	20
struct io_uring_map_buffers {
	__s32	fd;
	__u32	buf_start;	/* first registered buffer index to map */
	__u32	buf_end;	/* one past the last buffer index to map */
	__u32	flags;
	__u64	rsvd[2];	/* reserved, must be zero */
};
#endif
148
149 static unsigned long cycles_to_nsec(unsigned long cycles)
150 {
151         uint64_t val;
152
153         if (!tsc_rate)
154                 return cycles;
155
156         val = cycles * 1000000000ULL;
157         return val / tsc_rate;
158 }
159
/*
 * Inverse of plat_val_to_idx(): map a histogram bucket index back to a
 * representative latency value (bucket midpoint), converted to nsec when
 * the TSC rate is known.
 */
static unsigned long plat_idx_to_val(unsigned int idx)
{
	unsigned int error_bits;
	unsigned long k, base;

	assert(idx < PLAT_NR);

	/* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
	 * all bits of the sample as index */
	if (idx < (PLAT_VAL << 1))
		return cycles_to_nsec(idx);

	/* Find the group and compute the minimum value of that group */
	error_bits = (idx >> PLAT_BITS) - 1;
	base = ((unsigned long) 1) << (error_bits + PLAT_BITS);

	/* Find its bucket number of the group */
	k = idx % PLAT_VAL;

	/* Return the mean of the range of the bucket */
	return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits)));
}
182
/*
 * Walk the latency histogram (nr total samples) and compute the latency
 * value at each percentile point in plist[]. On success, *output points
 * to a malloc'ed array of plist_len values (caller frees) and
 * *minv/*maxv hold the smallest/largest percentile values found.
 * Returns the number of entries in *output, or 0 on allocation failure.
 */
unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
				   unsigned long **output,
				   unsigned long *maxv, unsigned long *minv)
{
	unsigned long sum = 0;
	unsigned int len = plist_len, i, j = 0;
	unsigned long *ovals = NULL;
	bool is_last;

	*minv = -1ULL;
	*maxv = 0;

	ovals = malloc(len * sizeof(*ovals));
	if (!ovals)
		return 0;

	/*
	 * Calculate bucket values, note down max and min values
	 */
	is_last = false;
	for (i = 0; i < PLAT_NR && !is_last; i++) {
		sum += io_u_plat[i];
		/* one bucket may satisfy several consecutive percentile points */
		while (sum >= ((long double) plist[j] / 100.0 * nr)) {
			assert(plist[j] <= 100.0);

			ovals[j] = plat_idx_to_val(i);
			if (ovals[j] < *minv)
				*minv = ovals[j];
			if (ovals[j] > *maxv)
				*maxv = ovals[j];

			is_last = (j == len - 1) != 0;
			if (is_last)
				break;

			j++;
		}
	}

	if (!is_last)
		fprintf(stderr, "error calculating latency percentiles\n");

	*output = ovals;
	return len;
}
228
/*
 * Pretty-print the latency percentiles for nr histogram samples, in
 * fio's usual percentile table format. Values are printed in tsc ticks
 * when no TSC rate is known, otherwise nsec, scaled down to usec when
 * large enough to warrant it.
 */
static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
				  unsigned int precision)
{
	unsigned int divisor, len, i, j = 0;
	unsigned long minv, maxv;
	unsigned long *ovals;
	int per_line, scale_down, time_width;
	bool is_last;
	char fmt[32];

	len = calc_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv);
	if (!len || !ovals)
		goto out;

	if (!tsc_rate) {
		scale_down = 0;
		divisor = 1;
		printf("    percentiles (tsc ticks):\n     |");
	} else if (minv > 2000 && maxv > 99999) {
		scale_down = 1;
		divisor = 1000;
		printf("    percentiles (usec):\n     |");
	} else {
		scale_down = 0;
		divisor = 1;
		printf("    percentiles (nsec):\n     |");
	}

	/* column width sized to the largest value that will be printed */
	time_width = max(5, (int) (log10(maxv / divisor) + 1));
	snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
			precision, time_width);
	/* fmt will be something like " %5.2fth=[%4llu]%c" */
	per_line = (80 - 7) / (precision + 10 + time_width);

	for (j = 0; j < len; j++) {
		/* for formatting */
		if (j != 0 && (j % per_line) == 0)
			printf("     |");

		/* end of the list */
		is_last = (j == len - 1) != 0;

		/* scale_down iterations of nsec -> usec, rounding up */
		for (i = 0; i < scale_down; i++)
			ovals[j] = (ovals[j] + 999) / 1000;

		printf(fmt, plist[j], ovals[j], is_last ? '\n' : ',');

		if (is_last)
			break;

		if ((j % per_line) == per_line - 1)	/* for formatting */
			printf("\n");
	}

out:
	free(ovals);
}
286
287 static unsigned int plat_val_to_idx(unsigned long val)
288 {
289         unsigned int msb, error_bits, base, offset, idx;
290
291         /* Find MSB starting from bit 0 */
292         if (val == 0)
293                 msb = 0;
294         else
295                 msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;
296
297         /*
298          * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
299          * all bits of the sample as index
300          */
301         if (msb <= PLAT_BITS)
302                 return val;
303
304         /* Compute the number of error bits to discard*/
305         error_bits = msb - PLAT_BITS;
306
307         /* Compute the number of buckets before the group */
308         base = (error_bits + 1) << PLAT_BITS;
309
310         /*
311          * Discard the error bits and apply the mask to find the
312          * index for the buckets in the group
313          */
314         offset = (PLAT_VAL - 1) & (val >> error_bits);
315
316         /* Make sure the index does not exceed (array size - 1) */
317         idx = (base + offset) < (PLAT_NR - 1) ?
318                 (base + offset) : (PLAT_NR - 1);
319
320         return idx;
321 }
322
/*
 * Account nr completions in the latency histogram, measured from the
 * submit timestamp saved in clock_batch[clock_index]. Compiled away
 * when the arch has no CPU cycle counter.
 */
static void add_stat(struct submitter *s, int clock_index, int nr)
{
#ifdef ARCH_HAVE_CPU_CLOCK
	unsigned long cycles;
	unsigned int pidx;

	/* clock_index 0 means no timestamp was recorded for this IO */
	if (!s->finish && clock_index) {
		cycles = get_cpu_clock();
		cycles -= s->clock_batch[clock_index];
		pidx = plat_val_to_idx(cycles);
		s->plat[pidx] += nr;
	}
#endif
}
337
/*
 * Ask the kernel to pre-map DMA buffers for the single target file via
 * the (experimental) IORING_REGISTER_MAP_BUFFERS opcode. Returns the
 * syscall result; fails with -1 if more than one file is in use.
 */
static int io_uring_map_buffers(struct submitter *s)
{
	struct io_uring_map_buffers map = {
		.fd		= s->files[0].real_fd,
		.buf_end	= depth,
	};

	if (do_nop)
		return 0;
	if (s->nr_files > 1) {
		fprintf(stderr, "Can't map buffers with multiple files\n");
		return -1;
	}

	return syscall(__NR_io_uring_register, s->ring_fd,
			IORING_REGISTER_MAP_BUFFERS, &map, 1);
}
355
/*
 * Register the per-depth IO buffers with the kernel so that
 * IORING_OP_READ_FIXED can be used against them.
 */
static int io_uring_register_buffers(struct submitter *s)
{
	if (do_nop)
		return 0;

	return syscall(__NR_io_uring_register, s->ring_fd,
			IORING_REGISTER_BUFFERS, s->iovecs, depth);
}
364
365 static int io_uring_register_files(struct submitter *s)
366 {
367         int i;
368
369         if (do_nop)
370                 return 0;
371
372         s->fds = calloc(s->nr_files, sizeof(__s32));
373         for (i = 0; i < s->nr_files; i++) {
374                 s->fds[i] = s->files[i].real_fd;
375                 s->files[i].fixed_fd = i;
376         }
377
378         return syscall(__NR_io_uring_register, s->ring_fd,
379                         IORING_REGISTER_FILES, s->fds, s->nr_files);
380 }
381
/*
 * io_uring_setup(2) wrapper: create a ring with the given number of
 * entries; the kernel fills *p with the resulting ring offsets.
 */
static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}
386
/*
 * Probe the kernel's supported opcodes; if non-vectored reads
 * (IORING_OP_READ) are available, prefer them over IORING_OP_READV.
 * Best-effort: keeps the vectored default on any failure.
 */
static void io_uring_probe(int fd)
{
	struct io_uring_probe *p;
	int ret;

	/* probe struct is followed by 256 per-opcode entries */
	p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	if (!p)
		return;

	memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	ret = syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE, p, 256);
	if (ret < 0)
		goto out;

	/* kernel too old to even know about IORING_OP_READ */
	if (IORING_OP_READ > p->ops_len)
		goto out;

	if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED))
		vectored = 0;
out:
	free(p);
}
409
/*
 * io_uring_enter(2) wrapper: submit to_submit sqes and/or wait for
 * min_complete completions, depending on flags.
 */
static int io_uring_enter(struct submitter *s, unsigned int to_submit,
			  unsigned int min_complete, unsigned int flags)
{
	return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete,
			flags, NULL, 0);
}
416
#ifndef CONFIG_HAVE_GETTID
/* Fallback for libcs that don't expose a gettid() wrapper */
static int gettid(void)
{
	return syscall(__NR_gettid);
}
#endif
423
424 static unsigned file_depth(struct submitter *s)
425 {
426         return (depth + s->nr_files - 1) / s->nr_files;
427 }
428
/*
 * Fill in the sqe at the given index: pick the next file (round-robin,
 * bounded by the per-file depth), choose a random block-aligned offset,
 * and set up a read in whichever form is in use (fixed buffer, plain
 * read, or readv). The file number goes in the low 32 bits of
 * user_data; when stats are running, the clock batch index goes in the
 * high 32 bits.
 */
static void init_io(struct submitter *s, unsigned index)
{
	struct io_uring_sqe *sqe = &s->sqes[index];
	unsigned long offset;
	struct file *f;
	long r;

	if (do_nop) {
		sqe->opcode = IORING_OP_NOP;
		return;
	}

	if (s->nr_files == 1) {
		f = &s->files[0];
	} else {
		f = &s->files[s->cur_file];
		/* advance once this file has its share of the depth queued */
		if (f->pending_ios >= file_depth(s)) {
			s->cur_file++;
			if (s->cur_file == s->nr_files)
				s->cur_file = 0;
			f = &s->files[s->cur_file];
		}
	}
	f->pending_ios++;

	/* random block-aligned offset within the file */
	r = __rand64(&s->rand_state);
	offset = (r % (f->max_blocks - 1)) * bs;

	if (register_files) {
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->fd = f->fixed_fd;
	} else {
		sqe->flags = 0;
		sqe->fd = f->real_fd;
	}
	if (fixedbufs) {
		sqe->opcode = IORING_OP_READ_FIXED;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = bs;
		sqe->buf_index = index;
	} else if (!vectored) {
		sqe->opcode = IORING_OP_READ;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = bs;
		sqe->buf_index = 0;
	} else {
		sqe->opcode = IORING_OP_READV;
		sqe->addr = (unsigned long) &s->iovecs[index];
		sqe->len = 1;
		sqe->buf_index = 0;
	}
	sqe->ioprio = 0;
	sqe->off = offset;
	sqe->user_data = (unsigned long) f->fileno;
	if (stats && stats_running)
		sqe->user_data |= ((unsigned long)s->clock_index << 32);
}
486
/*
 * Prepare up to max_ios new sqes at the SQ ring tail, stopping early if
 * the ring fills. The new tail is published with a release store so the
 * kernel only sees fully initialized sqes. Returns the number prepared.
 */
static int prep_more_ios_uring(struct submitter *s, int max_ios)
{
	struct io_sq_ring *ring = &s->sq_ring;
	unsigned index, tail, next_tail, prepped = 0;

	next_tail = tail = *ring->tail;
	do {
		next_tail++;
		/* ring full? */
		if (next_tail == atomic_load_acquire(ring->head))
			break;

		index = tail & sq_ring_mask;
		init_io(s, index);
		ring->array[index] = index;
		prepped++;
		tail = next_tail;
	} while (prepped < max_ios);

	if (prepped)
		atomic_store_release(ring->tail, tail);
	return prepped;
}
509
510 static int get_file_size(struct file *f)
511 {
512         struct stat st;
513
514         if (fstat(f->real_fd, &st) < 0)
515                 return -1;
516         if (S_ISBLK(st.st_mode)) {
517                 unsigned long long bytes;
518
519                 if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
520                         return -1;
521
522                 f->max_blocks = bytes / bs;
523                 return 0;
524         } else if (S_ISREG(st.st_mode)) {
525                 f->max_blocks = st.st_size / bs;
526                 return 0;
527         }
528
529         return -1;
530 }
531
/*
 * Reap all currently available completions from the CQ ring. Each
 * result must be a full block-sized read; per-file pending counts are
 * updated, and latency stat updates are batched by clock index.
 * Returns the number reaped, or -1 on an IO error.
 */
static int reap_events_uring(struct submitter *s)
{
	struct io_cq_ring *ring = &s->cq_ring;
	struct io_uring_cqe *cqe;
	unsigned head, reaped = 0;
	int last_idx = -1, stat_nr = 0;

	head = *ring->head;
	do {
		struct file *f;

		read_barrier();
		/* ring empty? */
		if (head == atomic_load_acquire(ring->tail))
			break;
		cqe = &ring->cqes[head & cq_ring_mask];
		if (!do_nop) {
			/* low 32 bits of user_data carry the file number */
			int fileno = cqe->user_data & 0xffffffff;

			f = &s->files[fileno];
			f->pending_ios--;
			if (cqe->res != bs) {
				printf("io: unexpected ret=%d\n", cqe->res);
				if (polled && cqe->res == -EOPNOTSUPP)
					printf("Your filesystem/driver/kernel doesn't support polled IO\n");
				return -1;
			}
		}
		if (stats) {
			/* high 32 bits carry the clock batch index */
			int clock_index = cqe->user_data >> 32;

			/* coalesce consecutive cqes sharing an index */
			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		head++;
	} while (1);

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	if (reaped) {
		s->inflight -= reaped;
		atomic_store_release(ring->head, head);
	}
	return reaped;
}
584
/*
 * Common per-thread setup: seed the random generators, number the
 * files, and (if stats are enabled) allocate the clock batch array and
 * latency histogram. Returns the clock batch size (a power of two, at
 * least 2), or 0 when stats are disabled.
 */
static int submitter_init(struct submitter *s)
{
	int i, nr_batch;

	s->tid = gettid();
	printf("submitter=%d, tid=%d\n", s->index, s->tid);

	/* seed both generators with the (per-thread unique) pthread id */
	__init_rand64(&s->rand_state, pthread_self());
	srand48(pthread_self());

	for (i = 0; i < MAX_FDS; i++)
		s->files[i].fileno = i;

	if (stats) {
		/* power-of-two size so the index can wrap with a mask */
		nr_batch = roundup_pow2(depth / batch_submit);
		if (nr_batch < 2)
			nr_batch = 2;
		s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
		/* slot 0 means "no timestamp", so start at 1 */
		s->clock_index = 1;

		s->plat = calloc(PLAT_NR, sizeof(unsigned long));
	} else {
		s->clock_batch = NULL;
		s->plat = NULL;
		nr_batch = 0;
	}

	return nr_batch;
}
614
615 #ifdef CONFIG_LIBAIO
616 static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
617 {
618         unsigned long offset, data;
619         struct file *f;
620         unsigned index;
621         long r;
622
623         index = 0;
624         while (index < max_ios) {
625                 struct iocb *iocb = &iocbs[index];
626
627                 if (s->nr_files == 1) {
628                         f = &s->files[0];
629                 } else {
630                         f = &s->files[s->cur_file];
631                         if (f->pending_ios >= file_depth(s)) {
632                                 s->cur_file++;
633                                 if (s->cur_file == s->nr_files)
634                                         s->cur_file = 0;
635                                 f = &s->files[s->cur_file];
636                         }
637                 }
638                 f->pending_ios++;
639
640                 r = lrand48();
641                 offset = (r % (f->max_blocks - 1)) * bs;
642                 io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base,
643                                 s->iovecs[index].iov_len, offset);
644
645                 data = f->fileno;
646                 if (stats && stats_running)
647                         data |= ((unsigned long) s->clock_index << 32);
648                 iocb->data = (void *) (uintptr_t) data;
649                 index++;
650         }
651         return index;
652 }
653
/*
 * Process evs completed libaio events: verify each was a full
 * block-sized read, update per-file accounting, and batch latency stat
 * updates by clock index. Returns the number reaped, or -1 on error.
 */
static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
{
	int last_idx = -1, stat_nr = 0;
	int reaped = 0;

	while (evs) {
		/* low 32 bits: file number; high 32 bits: clock batch index */
		unsigned long data = (uintptr_t) events[reaped].data;
		struct file *f = &s->files[data & 0xffffffff];

		f->pending_ios--;
		if (events[reaped].res != bs) {
			printf("io: unexpected ret=%ld\n", events[reaped].res);
			return -1;
		}
		if (stats) {
			int clock_index = data >> 32;

			/* coalesce consecutive events sharing an index */
			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		evs--;
	}

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	s->inflight -= reaped;
	s->done += reaped;
	return reaped;
}
691
/*
 * Worker thread body for the libaio backend: repeatedly prep a batch of
 * preads, submit them with io_submit(), and — whenever the queue would
 * overfill — wait for and reap completions with io_getevents(). Runs
 * until asked to finish; sets the global finish flag on exit.
 */
static void *submitter_aio_fn(void *data)
{
	struct submitter *s = data;
	int i, ret, prepped, nr_batch;
	struct iocb **iocbsptr;
	struct iocb *iocbs;
	struct io_event *events;

	nr_batch = submitter_init(s);

	/* NOTE(review): calloc results are used unchecked below */
	iocbsptr = calloc(depth, sizeof(struct iocb *));
	iocbs = calloc(depth, sizeof(struct iocb));
	events = calloc(depth, sizeof(struct io_event));

	for (i = 0; i < depth; i++)
		iocbsptr[i] = &iocbs[i];

	prepped = 0;
	do {
		int to_wait, to_submit, to_prep;

		if (!prepped && s->inflight < depth) {
			to_prep = min(depth - s->inflight, batch_submit);
			prepped = prep_more_ios_aio(s, to_prep, iocbs);
#ifdef ARCH_HAVE_CPU_CLOCK
			if (prepped && stats) {
				/* timestamp this batch for latency stats */
				s->clock_batch[s->clock_index] = get_cpu_clock();
				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
			}
#endif
		}
		s->inflight += prepped;
		to_submit = prepped;

		/* only wait for completions if the queue would overfill */
		if (to_submit && (s->inflight + to_submit <= depth))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, batch_complete);

		ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
		s->calls++;
		if (ret < 0) {
			perror("io_submit");
			break;
		} else if (ret != to_submit) {
			printf("submitted %d, wanted %d\n", ret, to_submit);
			break;
		}
		prepped = 0;

		while (to_wait) {
			int r;

			s->calls++;
			r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL);
			if (r < 0) {
				perror("io_getevents");
				break;
			} else if (r != to_wait) {
				printf("r=%d, wait=%d\n", r, to_wait);
				break;
			}
			r = reap_events_aio(s, events, r);
			s->reaps += r;
			to_wait -= r;
		}
	} while (!s->finish);

	free(iocbsptr);
	free(iocbs);
	free(events);
	finish = 1;
	return NULL;
}
766 #endif
767
/*
 * Worker thread body for the io_uring backend: prep a batch of sqes,
 * submit via io_uring_enter() (or let the SQPOLL thread pick them up),
 * then reap completions. Handles partial submissions, EAGAIN, and the
 * SQPOLL wakeup protocol. Runs until asked to finish; sets the global
 * finish flag on exit.
 */
static void *submitter_uring_fn(void *data)
{
	struct submitter *s = data;
	struct io_sq_ring *ring = &s->sq_ring;
	int ret, prepped, nr_batch;

	nr_batch = submitter_init(s);

	prepped = 0;
	do {
		int to_wait, to_submit, this_reap, to_prep;
		unsigned ring_flags = 0;

		if (!prepped && s->inflight < depth) {
			to_prep = min(depth - s->inflight, batch_submit);
			prepped = prep_more_ios_uring(s, to_prep);
#ifdef ARCH_HAVE_CPU_CLOCK
			if (prepped && stats) {
				/* timestamp this batch for latency stats */
				s->clock_batch[s->clock_index] = get_cpu_clock();
				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
			}
#endif
		}
		s->inflight += prepped;
submit_more:
		to_submit = prepped;
submit:
		/* only wait for completions if the queue would overfill */
		if (to_submit && (s->inflight + to_submit <= depth))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, batch_complete);

		/*
		 * Only need to call io_uring_enter if we're not using SQ thread
		 * poll, or if IORING_SQ_NEED_WAKEUP is set.
		 */
		if (sq_thread_poll)
			ring_flags = atomic_load_acquire(ring->flags);
		if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) {
			unsigned flags = 0;

			if (to_wait)
				flags = IORING_ENTER_GETEVENTS;
			if (ring_flags & IORING_SQ_NEED_WAKEUP)
				flags |= IORING_ENTER_SQ_WAKEUP;
			ret = io_uring_enter(s, to_submit, to_wait, flags);
			s->calls++;
		} else {
			/* for SQPOLL, we submitted it all effectively */
			ret = to_submit;
		}

		/*
		 * For non SQ thread poll, we already got the events we needed
		 * through the io_uring_enter() above. For SQ thread poll, we
		 * need to loop here until we find enough events.
		 */
		this_reap = 0;
		do {
			int r;

			r = reap_events_uring(s);
			if (r == -1) {
				s->finish = 1;
				break;
			} else if (r > 0)
				this_reap += r;
		} while (sq_thread_poll && this_reap < to_wait);
		s->reaps += this_reap;

		if (ret >= 0) {
			if (!ret) {
				/* nothing accepted; retry while IO is in flight */
				to_submit = 0;
				if (s->inflight)
					goto submit;
				continue;
			} else if (ret < to_submit) {
				/* partial submit: retry the remainder */
				int diff = to_submit - ret;

				s->done += ret;
				prepped -= diff;
				goto submit_more;
			}
			s->done += ret;
			prepped = 0;
			continue;
		} else if (ret < 0) {
			if (errno == EAGAIN) {
				if (s->finish)
					break;
				if (this_reap)
					goto submit;
				to_submit = 0;
				goto submit;
			}
			printf("io_submit: %s\n", strerror(errno));
			break;
		}
	} while (!s->finish);

	finish = 1;
	return NULL;
}
871
872 static struct submitter *get_submitter(int offset)
873 {
874         void *ret;
875
876         ret = submitter;
877         if (offset)
878                 ret += offset * (sizeof(*submitter) + depth * sizeof(struct iovec));
879         return ret;
880 }
881
882 static void do_finish(const char *reason)
883 {
884         int j;
885         printf("Exiting on %s\n", reason);
886         for (j = 0; j < nthreads; j++) {
887                 struct submitter *s = get_submitter(j);
888                 s->finish = 1;
889         }
890         if (max_iops > 100000)
891                 printf("Maximum IOPS=%luK\n", max_iops / 1000);
892         else if (max_iops)
893                 printf("Maximum IOPS=%lu\n", max_iops);
894         finish = 1;
895 }
896
/* SIGINT handler: request a clean shutdown of the whole run */
static void sig_int(int sig)
{
	do_finish("signal");
}
901
902 static void arm_sig_int(void)
903 {
904         struct sigaction act;
905
906         memset(&act, 0, sizeof(act));
907         act.sa_handler = sig_int;
908         act.sa_flags = SA_RESTART;
909         sigaction(SIGINT, &act, NULL);
910
911         /* Windows uses SIGBREAK as a quit signal from other applications */
912 #ifdef WIN32
913         sigaction(SIGBREAK, &act, NULL);
914 #endif
915 }
916
/*
 * Validate option combinations for the libaio backend, turning off (with
 * a warning) anything libaio cannot do, then initialize the aio context.
 * Returns the io_queue_init() result, or -1 with errno = EINVAL when
 * libaio support isn't compiled in.
 */
static int setup_aio(struct submitter *s)
{
#ifdef CONFIG_LIBAIO
	if (polled) {
		fprintf(stderr, "aio does not support polled IO\n");
		polled = 0;
	}
	if (sq_thread_poll) {
		fprintf(stderr, "aio does not support SQPOLL IO\n");
		sq_thread_poll = 0;
	}
	if (do_nop) {
		fprintf(stderr, "aio does not support nop\n");
		do_nop = 0;
	}
	if (fixedbufs || register_files) {
		fprintf(stderr, "aio does not support registered files or buffers\n");
		fixedbufs = register_files = 0;
	}

	return io_queue_init(depth, &s->aio_ctx);
#else
	fprintf(stderr, "Legacy AIO not available on this system/build\n");
	errno = EINVAL;
	return -1;
#endif
}
944
945 static int setup_ring(struct submitter *s)
946 {
947         struct io_sq_ring *sring = &s->sq_ring;
948         struct io_cq_ring *cring = &s->cq_ring;
949         struct io_uring_params p;
950         int ret, fd;
951         void *ptr;
952
953         memset(&p, 0, sizeof(p));
954
955         if (polled && !do_nop)
956                 p.flags |= IORING_SETUP_IOPOLL;
957         if (sq_thread_poll) {
958                 p.flags |= IORING_SETUP_SQPOLL;
959                 if (sq_thread_cpu != -1) {
960                         p.flags |= IORING_SETUP_SQ_AFF;
961                         p.sq_thread_cpu = sq_thread_cpu;
962                 }
963         }
964
965         fd = io_uring_setup(depth, &p);
966         if (fd < 0) {
967                 perror("io_uring_setup");
968                 return 1;
969         }
970         s->ring_fd = fd;
971
972         io_uring_probe(fd);
973
974         if (fixedbufs) {
975                 struct rlimit rlim;
976
977                 rlim.rlim_cur = RLIM_INFINITY;
978                 rlim.rlim_max = RLIM_INFINITY;
979                 /* ignore potential error, not needed on newer kernels */
980                 setrlimit(RLIMIT_MEMLOCK, &rlim);
981
982                 ret = io_uring_register_buffers(s);
983                 if (ret < 0) {
984                         perror("io_uring_register_buffers");
985                         return 1;
986                 }
987
988                 if (dma_map) {
989                         ret = io_uring_map_buffers(s);
990                         if (ret < 0) {
991                                 perror("io_uring_map_buffers");
992                                 return 1;
993                         }
994                 }
995         }
996
997         if (register_files) {
998                 ret = io_uring_register_files(s);
999                 if (ret < 0) {
1000                         perror("io_uring_register_files");
1001                         return 1;
1002                 }
1003         }
1004
1005         ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
1006                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
1007                         IORING_OFF_SQ_RING);
1008         sring->head = ptr + p.sq_off.head;
1009         sring->tail = ptr + p.sq_off.tail;
1010         sring->ring_mask = ptr + p.sq_off.ring_mask;
1011         sring->ring_entries = ptr + p.sq_off.ring_entries;
1012         sring->flags = ptr + p.sq_off.flags;
1013         sring->array = ptr + p.sq_off.array;
1014         sq_ring_mask = *sring->ring_mask;
1015
1016         s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
1017                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
1018                         IORING_OFF_SQES);
1019
1020         ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
1021                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
1022                         IORING_OFF_CQ_RING);
1023         cring->head = ptr + p.cq_off.head;
1024         cring->tail = ptr + p.cq_off.tail;
1025         cring->ring_mask = ptr + p.cq_off.ring_mask;
1026         cring->ring_entries = ptr + p.cq_off.ring_entries;
1027         cring->cqes = ptr + p.cq_off.cqes;
1028         cq_ring_mask = *cring->ring_mask;
1029         return 0;
1030 }
1031
1032 static void file_depths(char *buf)
1033 {
1034         bool prev = false;
1035         char *p;
1036         int i, j;
1037
1038         buf[0] = '\0';
1039         p = buf;
1040         for (j = 0; j < nthreads; j++) {
1041                 struct submitter *s = get_submitter(j);
1042
1043                 for (i = 0; i < s->nr_files; i++) {
1044                         struct file *f = &s->files[i];
1045
1046                         if (prev)
1047                                 p += sprintf(p, " %d", f->pending_ios);
1048                         else
1049                                 p += sprintf(p, "%d", f->pending_ios);
1050                         prev = true;
1051                 }
1052         }
1053 }
1054
/*
 * Print option help for the program named by 'argv' (argv[0] from main),
 * with each option's compiled-in default, then exit with 'status'.
 * A runtime of 0 is shown as "unlimited".
 */
static void usage(char *argv, int status)
{
	char runtime_str[16];
	snprintf(runtime_str, sizeof(runtime_str), "%d", runtime);
	printf("%s [options] -- [filenames]\n"
		" -d <int>  : IO Depth, default %d\n"
		" -s <int>  : Batch submit, default %d\n"
		" -c <int>  : Batch complete, default %d\n"
		" -b <int>  : Block size, default %d\n"
		" -p <bool> : Polled IO, default %d\n"
		" -B <bool> : Fixed buffers, default %d\n"
		" -D <bool> : DMA map fixed buffers, default %d\n"
		" -F <bool> : Register files, default %d\n"
		" -n <int>  : Number of threads, default %d\n"
		" -O <bool> : Use O_DIRECT, default %d\n"
		" -N <bool> : Perform just no-op requests, default %d\n"
		" -t <bool> : Track IO latencies, default %d\n"
		" -T <int>  : TSC rate in HZ\n"
		" -a <bool> : Use legacy aio, default %d\n"
		" -r <int>  : Runtime in seconds, default %s\n",
		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
		stats, aio, runtime == 0 ? "unlimited" : runtime_str);
	exit(status);
}
1080
1081 static void read_tsc_rate(void)
1082 {
1083         char buffer[32];
1084         int fd, ret;
1085
1086         if (tsc_rate)
1087                 return;
1088
1089         fd = open(TSC_RATE_FILE, O_RDONLY);
1090         if (fd < 0)
1091                 return;
1092
1093         ret = read(fd, buffer, sizeof(buffer));
1094         if (ret < 0) {
1095                 close(fd);
1096                 return;
1097         }
1098
1099         tsc_rate = strtoul(buffer, NULL, 10);
1100         printf("Using TSC rate %luHz\n", tsc_rate);
1101         close(fd);
1102 }
1103
1104 static void write_tsc_rate(void)
1105 {
1106         char buffer[32];
1107         struct stat sb;
1108         int fd, ret;
1109
1110         if (!stat(TSC_RATE_FILE, &sb))
1111                 return;
1112
1113         fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644);
1114         if (fd < 0)
1115                 return;
1116
1117         memset(buffer, 0, sizeof(buffer));
1118         sprintf(buffer, "%lu", tsc_rate);
1119         ret = write(fd, buffer, strlen(buffer));
1120         if (ret < 0)
1121                 perror("write");
1122         close(fd);
1123 }
1124
/*
 * Entry point. Phases:
 *   1. parse options into the file-scope globals (depth, bs, polled, ...);
 *   2. allocate the contiguous submitter array and distribute the target
 *      files round-robin over the submitter threads;
 *   3. allocate aligned IO buffers and set up one io_uring (or libaio
 *      context) per submitter;
 *   4. spawn the submitter threads, then loop printing per-second IOPS/BW
 *      stats until runtime expires or SIGINT sets 'finish';
 *   5. join the threads and dump latency percentiles when enabled.
 */
int main(int argc, char *argv[])
{
	struct submitter *s;
	unsigned long done, calls, reap;
	int err, i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
	struct file f;
	char *fdepths;
	void *ret;

	/* with no-op requests no filenames are needed */
	if (!do_nop && argc < 2)
		usage(argv[0], 1);

	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:h?")) != -1) {
		switch (opt) {
		case 'a':
			aio = !!atoi(optarg);
			break;
		case 'd':
			depth = atoi(optarg);
			break;
		case 's':
			batch_submit = atoi(optarg);
			if (!batch_submit)
				batch_submit = 1;
			break;
		case 'c':
			batch_complete = atoi(optarg);
			if (!batch_complete)
				batch_complete = 1;
			break;
		case 'b':
			bs = atoi(optarg);
			break;
		case 'p':
			polled = !!atoi(optarg);
			break;
		case 'B':
			fixedbufs = !!atoi(optarg);
			break;
		case 'F':
			register_files = !!atoi(optarg);
			break;
		case 'n':
			nthreads = atoi(optarg);
			if (!nthreads) {
				printf("Threads must be non-zero\n");
				usage(argv[0], 1);
			}
			break;
		case 'N':
			do_nop = !!atoi(optarg);
			break;
		case 'O':
			/* -O 1 means O_DIRECT, i.e. NOT buffered */
			buffered = !atoi(optarg);
			break;
		case 't':
#ifndef ARCH_HAVE_CPU_CLOCK
			fprintf(stderr, "Stats not supported on this CPU\n");
			return 1;
#endif
			stats = !!atoi(optarg);
			break;
		case 'T':
#ifndef ARCH_HAVE_CPU_CLOCK
			fprintf(stderr, "Stats not supported on this CPU\n");
			return 1;
#endif
			tsc_rate = strtoul(optarg, NULL, 10);
			write_tsc_rate();
			break;
		case 'r':
			runtime = atoi(optarg);
			break;
		case 'D':
			dma_map = !!atoi(optarg);
			break;
		case 'h':
		case '?':
		default:
			usage(argv[0], 0);
			break;
		}
	}

	if (stats)
		read_tsc_rate();

	/* batch sizes larger than the queue depth are pointless; clamp */
	if (batch_complete > depth)
		batch_complete = depth;
	if (batch_submit > depth)
		batch_submit = depth;
	/* DMA mapping only makes sense on top of registered buffers */
	if (!fixedbufs && dma_map)
		dma_map = 0;

	/*
	 * One contiguous region: each submitter struct is immediately
	 * followed by its 'depth' iovecs — get_submitter() relies on
	 * exactly this stride. NOTE(review): calloc result unchecked.
	 */
	submitter = calloc(nthreads, sizeof(*submitter) +
				depth * sizeof(struct iovec));
	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		s->index = j;
		s->done = s->calls = s->reaps = 0;
	}

	flags = O_RDONLY | O_NOATIME;
	if (!buffered)
		flags |= O_DIRECT;

	/* distribute files over threads; extra threads get one file each */
	j = 0;
	i = optind;
	nfiles = argc - i;
	if (!do_nop) {
		if (!nfiles) {
			printf("No files specified\n");
			usage(argv[0], 1);
		}
		threads_per_f = nthreads / nfiles;
		/* make sure each thread gets assigned files */
		if (threads_per_f == 0) {
			threads_per_f = 1;
		} else {
			threads_rem = nthreads - threads_per_f * nfiles;
		}
	}
	while (!do_nop && i < argc) {
		int k, limit;

		memset(&f, 0, sizeof(f));

		fd = open(argv[i], flags);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		f.real_fd = fd;
		if (get_file_size(&f)) {
			printf("failed getting size of device/file\n");
			return 1;
		}
		if (f.max_blocks <= 1) {
			printf("Zero file/device size?\n");
			return 1;
		}
		/* max_blocks becomes the highest valid block index */
		f.max_blocks--;

		/* threads_rem spreads the leftover threads, one per file */
		limit = threads_per_f;
		limit += threads_rem > 0 ? 1 : 0;
		for (k = 0; k < limit; k++) {
			s = get_submitter((j + k) % nthreads);

			if (s->nr_files == MAX_FDS) {
				printf("Max number of files (%d) reached\n", MAX_FDS);
				break;
			}

			/* each submitter gets its own copy of the file entry */
			memcpy(&s->files[s->nr_files], &f, sizeof(f));

			printf("Added file %s (submitter %d)\n", argv[i], s->index);
			s->nr_files++;
		}
		threads_rem--;
		i++;
		j += limit;
	}

	arm_sig_int();

	/* per-submitter aligned IO buffers, one of 'bs' bytes per slot */
	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		for (i = 0; i < depth; i++) {
			void *buf;

			if (posix_memalign(&buf, bs, bs)) {
				printf("failed alloc\n");
				return 1;
			}
			s->iovecs[i].iov_base = buf;
			s->iovecs[i].iov_len = bs;
		}
	}

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);

		if (!aio)
			err = setup_ring(s);
		else
			err = setup_aio(s);
		if (err) {
			printf("ring setup failed: %s, %d\n", strerror(errno), err);
			return 1;
		}
	}
	/* ring geometry is identical across submitters; report from #0 */
	s = get_submitter(0);
	printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
	if (!aio)
		printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
	else
		printf("Engine=aio\n");

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		if (!aio)
			pthread_create(&s->thread, NULL, submitter_uring_fn, s);
#ifdef CONFIG_LIBAIO
		else
			pthread_create(&s->thread, NULL, submitter_aio_fn, s);
#endif
	}

	/*
	 * Scratch for file_depths(): up to 8 chars per file entry.
	 * 's' still points at submitter 0 here — assumes all submitters
	 * have the same nr_files. NOTE(review): malloc result unchecked.
	 */
	fdepths = malloc(8 * s->nr_files * nthreads);
	reap = calls = done = 0;
	do {
		unsigned long this_done = 0;
		unsigned long this_reap = 0;
		unsigned long this_call = 0;
		unsigned long rpc = 0, ipc = 0;
		unsigned long iops, bw;

		sleep(1);
		if (runtime && !--runtime)
			do_finish("timeout");

		/* don't print partial run, if interrupted by signal */
		if (finish)
			break;

		/* one second in to the run, enable stats */
		if (stats)
			stats_running = 1;

		/* totals since start; per-second numbers are deltas below */
		for (j = 0; j < nthreads; j++) {
			s = get_submitter(j);
			this_done += s->done;
			this_call += s->calls;
			this_reap += s->reaps;
		}
		if (this_call - calls) {
			/* IOs submitted / reaped per syscall this interval */
			rpc = (this_done - done) / (this_call - calls);
			ipc = (this_reap - reap) / (this_call - calls);
		} else
			rpc = ipc = -1;
		file_depths(fdepths);
		iops = this_done - done;
		/* bandwidth in MiB/s; branch avoids 0 from integer division */
		if (bs > 1048576)
			bw = iops * (bs / 1048576);
		else
			bw = iops / (1048576 / bs);
		if (iops > 100000)
			printf("IOPS=%luK, ", iops / 1000);
		else
			printf("IOPS=%lu, ", iops);
		max_iops = max(max_iops, iops);
		if (!do_nop)
			printf("BW=%luMiB/s, ", bw);
		printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths);
		done = this_done;
		calls = this_call;
		reap = this_reap;
	} while (!finish);

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		pthread_join(s->thread, &ret);
		close(s->ring_fd);

		if (stats) {
			unsigned long nr;

			printf("%d: Latency percentiles:\n", s->tid);
			for (i = 0, nr = 0; i < PLAT_NR; i++)
				nr += s->plat[i];
			show_clat_percentiles(s->plat, nr, 4);
			free(s->clock_batch);
			free(s->plat);
		}
	}

	free(fdepths);
	free(submitter);
	return 0;
}