test: add basic test for io_uring ioengine
[fio.git] / t / io_uring.c
CommitLineData
c9fb4c5b
JA
1#include <stdio.h>
2#include <errno.h>
3#include <assert.h>
4#include <stdlib.h>
5#include <stddef.h>
6#include <signal.h>
7#include <inttypes.h>
932131c9 8#include <math.h>
c9fb4c5b 9
256714ea
JA
10#ifdef CONFIG_LIBAIO
11#include <libaio.h>
12#endif
13
4b9e13dc
JA
14#ifdef CONFIG_LIBNUMA
15#include <numa.h>
16#endif
17
c9fb4c5b
JA
18#include <sys/types.h>
19#include <sys/stat.h>
20#include <sys/ioctl.h>
21#include <sys/syscall.h>
22#include <sys/resource.h>
c3e2fc25 23#include <sys/mman.h>
e31b8288 24#include <sys/uio.h>
c9fb4c5b
JA
25#include <linux/fs.h>
26#include <fcntl.h>
27#include <unistd.h>
c9fb4c5b
JA
28#include <string.h>
29#include <pthread.h>
30#include <sched.h>
31
74efb029 32#include "../arch/arch.h"
57fa61f0 33#include "../lib/types.h"
932131c9 34#include "../lib/roundup.h"
9eff5320 35#include "../lib/rand.h"
932131c9 36#include "../minmax.h"
f3e769a4 37#include "../os/linux/io_uring.h"
7d04588a 38#include "../engines/nvme.h"
ac122fea 39
/*
 * Userspace view of the submission queue ring: pointers into the
 * region mmap'ed from the kernel (IORING_OFF_SQ_RING).
 */
struct io_sq_ring {
	unsigned *head;		/* consumer (kernel) index */
	unsigned *tail;		/* producer (app) index */
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;	/* indirection array of sqe indices */
};
48
/*
 * Userspace view of the completion queue ring (IORING_OFF_CQ_RING mmap).
 */
struct io_cq_ring {
	unsigned *head;		/* consumer (app) index */
	unsigned *tail;		/* producer (kernel) index */
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};
56
701d1277 57#define DEPTH 128
2e7888ef
JA
58#define BATCH_SUBMIT 32
59#define BATCH_COMPLETE 32
c9fb4c5b
JA
60#define BS 4096
61
a7086591
JA
62#define MAX_FDS 16
63
c3e2fc25 64static unsigned sq_ring_mask, cq_ring_mask;
e39c34dc 65
a7086591
JA
/* Per-target-file state for one submitter. */
struct file {
	unsigned long max_blocks;	/* size in 'bs'-sized blocks */
	unsigned long max_size;		/* size in bytes */
	unsigned long cur_off;		/* next offset for sequential IO */
	unsigned pending_ios;		/* IOs currently in flight on this file */
	unsigned int nsid;	/* nsid field required for nvme-passthrough */
	unsigned int lba_shift;	/* lba_shift field required for nvme-passthrough */
	int real_fd;		/* fd from open(2) */
	int fixed_fd;		/* index into the registered-files table */
	int fileno;		/* index of this file in submitter->files[] */
};
77
932131c9
JA
78#define PLAT_BITS 6
79#define PLAT_VAL (1 << PLAT_BITS)
80#define PLAT_GROUP_NR 29
81#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
82
c9fb4c5b
JA
/*
 * All state for one submitter thread: its ring (or aio context), the
 * files it drives, IO buffers, and statistics bookkeeping.
 */
struct submitter {
	pthread_t thread;
	int ring_fd;		/* fd returned by io_uring_setup() */
	int enter_ring_fd;	/* fd (or registered index) used for enter */
	int index;
	struct io_sq_ring sq_ring;
	struct io_uring_sqe *sqes;	/* mmap'ed SQE array */
	struct io_cq_ring cq_ring;
	int inflight;
	int tid;
	unsigned long reaps;
	unsigned long done;
	unsigned long calls;
	volatile int finish;	/* set by signal handler to stop the loop */

	__s32 *fds;		/* table passed to IORING_REGISTER_FILES */

	struct taus258_state rand_state;

	/* submit-side timestamps; index 0 is unused (see add_stat) */
	unsigned long *clock_batch;
	int clock_index;
	unsigned long *plat;	/* latency histogram, PLAT_NR buckets */

#ifdef CONFIG_LIBAIO
	io_context_t aio_ctx;
#endif

	int numa_node;		/* -1 when unknown/unsupported */
	const char *filename;

	struct file files[MAX_FDS];
	unsigned nr_files;
	unsigned cur_file;	/* round-robin cursor over files[] */
	struct iovec iovecs[];	/* flexible array, one buffer per queue slot */
};
118
e39863e3 119static struct submitter *submitter;
c9fb4c5b 120static volatile int finish;
52479d8b 121static int stats_running;
16d25711 122static unsigned long max_iops;
c409e4c2 123static long t_io_uring_page_size;
c9fb4c5b 124
e39863e3
KB
125static int depth = DEPTH;
126static int batch_submit = BATCH_SUBMIT;
127static int batch_complete = BATCH_COMPLETE;
5bd526f2 128static int bs = BS;
f0403f94 129static int polled = 1; /* use IO polling */
701d1277 130static int fixedbufs = 1; /* use fixed user buffers */
a71ad043 131static int dma_map; /* pre-map DMA buffers */
8c5fa755 132static int register_files = 1; /* use fixed files */
f0403f94 133static int buffered = 0; /* use buffered IO, not O_DIRECT */
3d7d00a3
JA
134static int sq_thread_poll = 0; /* use kernel submission/poller thread */
135static int sq_thread_cpu = -1; /* pin above thread to this CPU */
8025517d 136static int do_nop = 0; /* no-op SQ ring commands */
54319661 137static int nthreads = 1;
932131c9 138static int stats = 0; /* generate IO stats */
256714ea 139static int aio = 0; /* use libaio */
beda9d8d
JA
140static int runtime = 0; /* runtime */
141static int random_io = 1; /* random or sequential IO */
ca8c91c5 142static int register_ring = 1; /* register ring */
379406bc 143static int use_sync = 0; /* use preadv2 */
4b9e13dc 144static int numa_placement = 0; /* set to node of device */
7d04588a 145static int pt = 0; /* passthrough I/O or not */
256714ea 146
932131c9 147static unsigned long tsc_rate;
c9fb4c5b 148
203e4c26
JA
149#define TSC_RATE_FILE "tsc-rate"
150
84106576 151static int vectored = 1;
b3915995 152
932131c9 153static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
ad45a465 154 80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
932131c9
JA
155static int plist_len = 17;
156
a71ad043 157#ifndef IORING_REGISTER_MAP_BUFFERS
8538256c 158#define IORING_REGISTER_MAP_BUFFERS 22
a71ad043
JA
159struct io_uring_map_buffers {
160 __s32 fd;
161 __u32 buf_start;
162 __u32 buf_end;
702b0be2
JA
163 __u32 flags;
164 __u64 rsvd[2];
a71ad043
JA
165};
166#endif
167
7d04588a
AG
168static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
169 enum nvme_csi csi, void *data)
170{
171 struct nvme_passthru_cmd cmd = {
172 .opcode = nvme_admin_identify,
173 .nsid = nsid,
174 .addr = (__u64)(uintptr_t)data,
175 .data_len = NVME_IDENTIFY_DATA_SIZE,
176 .cdw10 = cns,
177 .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT,
178 .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT,
179 };
180
181 return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
182}
183
/*
 * Query the namespace id, LBA data size and namespace capacity (in LBAs)
 * for an NVMe generic char device. Returns 0 on success, negative on error.
 * NOTE(review): on failure this closes the caller's fd as a side effect -
 * verify callers neither reuse nor re-close it afterwards.
 */
static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba)
{
	struct nvme_id_ns ns;
	int namespace_id;
	int err;

	namespace_id = ioctl(fd, NVME_IOCTL_ID);
	if (namespace_id < 0) {
		fprintf(stderr, "error failed to fetch namespace-id\n");
		close(fd);
		return -errno;
	}

	/*
	 * Identify namespace to get namespace-id, namespace size in LBA's
	 * and LBA data size.
	 */
	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
				NVME_CSI_NVM, &ns);
	if (err) {
		fprintf(stderr, "error failed to fetch identify namespace\n");
		close(fd);
		return err;
	}

	*nsid = namespace_id;
	/* LBA data size of the currently active format (flbas low nibble) */
	*lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds;
	*nlba = ns.nsze;

	return 0;
}
215
932131c9
JA
216static unsigned long cycles_to_nsec(unsigned long cycles)
217{
218 uint64_t val;
219
220 if (!tsc_rate)
221 return cycles;
222
223 val = cycles * 1000000000ULL;
224 return val / tsc_rate;
225}
226
/*
 * Inverse of plat_val_to_idx(): map a histogram bucket index back to a
 * representative latency (the midpoint of the bucket), in nanoseconds.
 */
static unsigned long plat_idx_to_val(unsigned int idx)
{
	unsigned int error_bits;
	unsigned long k, base;

	assert(idx < PLAT_NR);

	/* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
	 * all bits of the sample as index */
	if (idx < (PLAT_VAL << 1))
		return cycles_to_nsec(idx);

	/* Find the group and compute the minimum value of that group */
	error_bits = (idx >> PLAT_BITS) - 1;
	base = ((unsigned long) 1) << (error_bits + PLAT_BITS);

	/* Find its bucket number of the group */
	k = idx % PLAT_VAL;

	/* Return the mean of the range of the bucket */
	return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits)));
}
249
c409e4c2
AG
250unsigned int calculate_clat_percentiles(unsigned long *io_u_plat,
251 unsigned long nr, unsigned long **output,
252 unsigned long *maxv, unsigned long *minv)
932131c9
JA
253{
254 unsigned long sum = 0;
255 unsigned int len = plist_len, i, j = 0;
256 unsigned long *ovals = NULL;
257 bool is_last;
258
bb209d68 259 *minv = -1UL;
932131c9
JA
260 *maxv = 0;
261
262 ovals = malloc(len * sizeof(*ovals));
263 if (!ovals)
264 return 0;
265
266 /*
267 * Calculate bucket values, note down max and min values
268 */
269 is_last = false;
270 for (i = 0; i < PLAT_NR && !is_last; i++) {
271 sum += io_u_plat[i];
272 while (sum >= ((long double) plist[j] / 100.0 * nr)) {
273 assert(plist[j] <= 100.0);
274
275 ovals[j] = plat_idx_to_val(i);
276 if (ovals[j] < *minv)
277 *minv = ovals[j];
278 if (ovals[j] > *maxv)
279 *maxv = ovals[j];
280
281 is_last = (j == len - 1) != 0;
282 if (is_last)
283 break;
284
285 j++;
286 }
287 }
288
289 if (!is_last)
290 fprintf(stderr, "error calculating latency percentiles\n");
291
292 *output = ovals;
293 return len;
294}
295
296static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
297 unsigned int precision)
298{
299 unsigned int divisor, len, i, j = 0;
300 unsigned long minv, maxv;
301 unsigned long *ovals;
302 int per_line, scale_down, time_width;
303 bool is_last;
304 char fmt[32];
305
c409e4c2 306 len = calculate_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv);
932131c9
JA
307 if (!len || !ovals)
308 goto out;
309
310 if (!tsc_rate) {
311 scale_down = 0;
312 divisor = 1;
313 printf(" percentiles (tsc ticks):\n |");
314 } else if (minv > 2000 && maxv > 99999) {
315 scale_down = 1;
316 divisor = 1000;
317 printf(" percentiles (usec):\n |");
318 } else {
319 scale_down = 0;
320 divisor = 1;
321 printf(" percentiles (nsec):\n |");
322 }
323
324 time_width = max(5, (int) (log10(maxv / divisor) + 1));
325 snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
326 precision, time_width);
327 /* fmt will be something like " %5.2fth=[%4llu]%c" */
328 per_line = (80 - 7) / (precision + 10 + time_width);
329
330 for (j = 0; j < len; j++) {
331 /* for formatting */
332 if (j != 0 && (j % per_line) == 0)
333 printf(" |");
334
335 /* end of the list */
336 is_last = (j == len - 1) != 0;
337
338 for (i = 0; i < scale_down; i++)
339 ovals[j] = (ovals[j] + 999) / 1000;
340
341 printf(fmt, plist[j], ovals[j], is_last ? '\n' : ',');
342
343 if (is_last)
344 break;
345
346 if ((j % per_line) == per_line - 1) /* for formatting */
347 printf("\n");
348 }
349
350out:
351 free(ovals);
352}
353
#ifdef ARCH_HAVE_CPU_CLOCK
/*
 * Map a latency sample (in cycles) to a histogram bucket index using
 * log-linear bucketing: PLAT_BITS bits of linear precision per power of 2.
 */
static unsigned int plat_val_to_idx(unsigned long val)
{
	unsigned int msb, error_bits, base, offset, idx;

	/* Find MSB starting from bit 0 */
	if (val == 0)
		msb = 0;
	else
		msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;

	/*
	 * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
	 * all bits of the sample as index
	 */
	if (msb <= PLAT_BITS)
		return val;

	/* Compute the number of error bits to discard*/
	error_bits = msb - PLAT_BITS;

	/* Compute the number of buckets before the group */
	base = (error_bits + 1) << PLAT_BITS;

	/*
	 * Discard the error bits and apply the mask to find the
	 * index for the buckets in the group
	 */
	offset = (PLAT_VAL - 1) & (val >> error_bits);

	/* Make sure the index does not exceed (array size - 1) */
	idx = (base + offset) < (PLAT_NR - 1) ?
		(base + offset) : (PLAT_NR - 1);

	return idx;
}
#endif
932131c9
JA
391
/*
 * Account 'nr' completions against the submit-time stamp stored at
 * clock_batch[clock_index], adding the elapsed cycles to the latency
 * histogram. Compiles to a no-op without CPU clock support.
 */
static void add_stat(struct submitter *s, int clock_index, int nr)
{
#ifdef ARCH_HAVE_CPU_CLOCK
	unsigned long cycles;
	unsigned int pidx;

	/* clock_index 0 means "no timestamp recorded" - skip it */
	if (!s->finish && clock_index) {
		cycles = get_cpu_clock();
		cycles -= s->clock_batch[clock_index];
		pidx = plat_val_to_idx(cycles);
		s->plat[pidx] += nr;
	}
#endif
}
406
a71ad043
JA
407static int io_uring_map_buffers(struct submitter *s)
408{
409 struct io_uring_map_buffers map = {
410 .fd = s->files[0].real_fd,
a71ad043 411 .buf_end = depth,
a71ad043
JA
412 };
413
414 if (do_nop)
415 return 0;
d49885aa
JA
416 if (s->nr_files > 1)
417 fprintf(stdout, "Mapping buffers may not work with multiple files\n");
a71ad043
JA
418
419 return syscall(__NR_io_uring_register, s->ring_fd,
420 IORING_REGISTER_MAP_BUFFERS, &map, 1);
421}
422
2ea53ca3 423static int io_uring_register_buffers(struct submitter *s)
c9fb4c5b 424{
8025517d
JA
425 if (do_nop)
426 return 0;
427
bfed648c 428 return syscall(__NR_io_uring_register, s->ring_fd,
55845033 429 IORING_REGISTER_BUFFERS, s->iovecs, roundup_pow2(depth));
2ea53ca3
JA
430}
431
a7abc9fb
JA
432static int io_uring_register_files(struct submitter *s)
433{
48e698fa 434 int i;
a7abc9fb 435
8025517d
JA
436 if (do_nop)
437 return 0;
438
48e698fa
JA
439 s->fds = calloc(s->nr_files, sizeof(__s32));
440 for (i = 0; i < s->nr_files; i++) {
441 s->fds[i] = s->files[i].real_fd;
442 s->files[i].fixed_fd = i;
443 }
a7abc9fb 444
bfed648c 445 return syscall(__NR_io_uring_register, s->ring_fd,
919850d2 446 IORING_REGISTER_FILES, s->fds, s->nr_files);
a7abc9fb
JA
447}
448
2ea53ca3
JA
449static int io_uring_setup(unsigned entries, struct io_uring_params *p)
450{
1db268db
JA
451 /*
452 * Clamp CQ ring size at our SQ ring size, we don't need more entries
453 * than that.
454 */
455 p->flags |= IORING_SETUP_CQSIZE;
456 p->cq_entries = entries;
457
bfed648c 458 return syscall(__NR_io_uring_setup, entries, p);
c9fb4c5b
JA
459}
460
b3915995
JA
461static void io_uring_probe(int fd)
462{
463 struct io_uring_probe *p;
464 int ret;
465
466 p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
467 if (!p)
468 return;
469
470 memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
471 ret = syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE, p, 256);
472 if (ret < 0)
473 goto out;
474
475 if (IORING_OP_READ > p->ops_len)
476 goto out;
477
478 if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED))
84106576 479 vectored = 0;
b3915995
JA
480out:
481 free(p);
482}
483
c3e2fc25
JA
/*
 * Thin wrapper around io_uring_enter(2). Uses the registered ring fd when
 * ring registration is enabled, and the raw syscall path (or fio's inline
 * syscall helper on supported architectures).
 */
static int io_uring_enter(struct submitter *s, unsigned int to_submit,
			  unsigned int min_complete, unsigned int flags)
{
	if (register_ring)
		flags |= IORING_ENTER_REGISTERED_RING;
#ifdef FIO_ARCH_HAS_SYSCALL
	return __do_syscall6(__NR_io_uring_enter, s->enter_ring_fd, to_submit,
				min_complete, flags, NULL, 0);
#else
	return syscall(__NR_io_uring_enter, s->enter_ring_fd, to_submit,
			min_complete, flags, NULL, 0);
#endif
}
497
#ifndef CONFIG_HAVE_GETTID
/* Fallback for C libraries that don't provide a gettid() wrapper. */
static int gettid(void)
{
	return syscall(__NR_gettid);
}
#endif
c9fb4c5b 504
701d1277
JA
505static unsigned file_depth(struct submitter *s)
506{
e39863e3 507 return (depth + s->nr_files - 1) / s->nr_files;
701d1277
JA
508}
509
/*
 * Fill SQE slot 'index' with the next read (or nop): pick a file, pick an
 * offset (random or sequential), and encode file index / clock index into
 * user_data for completion-side bookkeeping.
 */
static void init_io(struct submitter *s, unsigned index)
{
	struct io_uring_sqe *sqe = &s->sqes[index];
	unsigned long offset;
	struct file *f;
	long r;

	if (do_nop) {
		sqe->opcode = IORING_OP_NOP;
		return;
	}

	/* round-robin to the next file once this one has its depth share */
	if (s->nr_files == 1) {
		f = &s->files[0];
	} else {
		f = &s->files[s->cur_file];
		if (f->pending_ios >= file_depth(s)) {
			s->cur_file++;
			if (s->cur_file == s->nr_files)
				s->cur_file = 0;
			f = &s->files[s->cur_file];
		}
	}
	f->pending_ios++;

	if (random_io) {
		r = __rand64(&s->rand_state);
		offset = (r % (f->max_blocks - 1)) * bs;
	} else {
		/* sequential: advance and wrap at end of file */
		offset = f->cur_off;
		f->cur_off += bs;
		if (f->cur_off + bs > f->max_size)
			f->cur_off = 0;
	}

	if (register_files) {
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->fd = f->fixed_fd;
	} else {
		sqe->flags = 0;
		sqe->fd = f->real_fd;
	}
	if (fixedbufs) {
		/* pre-registered buffer: addr/len given directly */
		sqe->opcode = IORING_OP_READ_FIXED;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = bs;
		sqe->buf_index = index;
	} else if (!vectored) {
		/* non-vectored READ, preferred when the kernel supports it */
		sqe->opcode = IORING_OP_READ;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = bs;
		sqe->buf_index = 0;
	} else {
		sqe->opcode = IORING_OP_READV;
		sqe->addr = (unsigned long) &s->iovecs[index];
		sqe->len = 1;
		sqe->buf_index = 0;
	}
	sqe->ioprio = 0;
	sqe->off = offset;
	/* low 32 bits: file index; high 32 bits: clock batch index */
	sqe->user_data = (unsigned long) f->fileno;
	if (stats && stats_running)
		sqe->user_data |= ((uint64_t)s->clock_index << 32);
}
574
7d04588a
AG
/*
 * Passthrough variant of init_io(): prepare an IORING_OP_URING_CMD SQE
 * (128-byte SQEs, hence 'index << 1') carrying an embedded NVMe read
 * command for slot 'index'.
 */
static void init_io_pt(struct submitter *s, unsigned index)
{
	struct io_uring_sqe *sqe = &s->sqes[index << 1];
	unsigned long offset;
	struct file *f;
	struct nvme_uring_cmd *cmd;
	unsigned long long slba;
	unsigned long long nlb;
	long r;

	/* round-robin to the next file once this one has its depth share */
	if (s->nr_files == 1) {
		f = &s->files[0];
	} else {
		f = &s->files[s->cur_file];
		if (f->pending_ios >= file_depth(s)) {
			s->cur_file++;
			if (s->cur_file == s->nr_files)
				s->cur_file = 0;
			f = &s->files[s->cur_file];
		}
	}
	f->pending_ios++;

	if (random_io) {
		r = __rand64(&s->rand_state);
		offset = (r % (f->max_blocks - 1)) * bs;
	} else {
		offset = f->cur_off;
		f->cur_off += bs;
		if (f->cur_off + bs > f->max_size)
			f->cur_off = 0;
	}

	if (register_files) {
		sqe->fd = f->fixed_fd;
		sqe->flags = IOSQE_FIXED_FILE;
	} else {
		sqe->fd = f->real_fd;
		sqe->flags = 0;
	}
	sqe->opcode = IORING_OP_URING_CMD;
	/* low 32 bits: file index; high 32 bits: clock batch index */
	sqe->user_data = (unsigned long) f->fileno;
	if (stats)
		sqe->user_data |= ((__u64) s->clock_index << 32ULL);
	sqe->cmd_op = NVME_URING_CMD_IO;
	slba = offset >> f->lba_shift;
	nlb = (bs >> f->lba_shift) - 1;	/* NVMe NLB field is zero-based */
	cmd = (struct nvme_uring_cmd *)&sqe->cmd;
	/* cdw10 and cdw11 represent starting slba*/
	cmd->cdw10 = slba & 0xffffffff;
	cmd->cdw11 = slba >> 32;
	/* cdw12 represent number of lba to be read*/
	cmd->cdw12 = nlb;
	cmd->addr = (unsigned long) s->iovecs[index].iov_base;
	cmd->data_len = bs;
	cmd->nsid = f->nsid;
	cmd->opcode = 2;	/* 0x2 = NVMe read command */
}
633
/*
 * Prepare up to max_ios new SQEs in the SQ ring. Stops early when the ring
 * is full. Publishes the new tail with release semantics and returns the
 * number of entries prepped.
 */
static int prep_more_ios_uring(struct submitter *s, int max_ios)
{
	struct io_sq_ring *ring = &s->sq_ring;
	unsigned index, tail, next_tail, prepped = 0;

	next_tail = tail = *ring->tail;
	do {
		next_tail++;
		/* ring is full when the bumped tail would meet the head */
		if (next_tail == atomic_load_acquire(ring->head))
			break;

		index = tail & sq_ring_mask;
		if (pt)
			init_io_pt(s, index);
		else
			init_io(s, index);
		ring->array[index] = index;
		prepped++;
		tail = next_tail;
	} while (prepped < max_ios);

	if (prepped)
		atomic_store_release(ring->tail, tail);
	return prepped;
}
659
/*
 * Fill in f->max_blocks/max_size (and NVMe fields for passthrough mode)
 * based on what f->real_fd refers to. Returns 0 on success, -1 on error
 * or unsupported file type.
 */
static int get_file_size(struct file *f)
{
	struct stat st;

	if (fstat(f->real_fd, &st) < 0)
		return -1;
	if (pt) {
		__u64 nlba;
		__u32 lbs;
		int ret;

		if (!S_ISCHR(st.st_mode)) {
			fprintf(stderr, "passthrough works with only nvme-ns "
					"generic devices (/dev/ngXnY)\n");
			return -1;
		}
		ret = nvme_get_info(f->real_fd, &f->nsid, &lbs, &nlba);
		if (ret)
			return -1;
		if ((bs % lbs) != 0) {
			printf("error: bs:%d should be a multiple logical_block_size:%d\n",
					bs, lbs);
			return -1;
		}
		/*
		 * NOTE(review): nlba is a count of LBAs while bs is in bytes;
		 * dividing and assigning directly mixes units - confirm whether
		 * this should be (nlba << lba_shift) / bs and max_size in bytes.
		 */
		f->max_blocks = nlba / bs;
		f->max_size = nlba;
		f->lba_shift = ilog2(lbs);
		return 0;
	} else if (S_ISBLK(st.st_mode)) {
		unsigned long long bytes;

		if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
			return -1;

		f->max_blocks = bytes / bs;
		f->max_size = bytes;
		return 0;
	} else if (S_ISREG(st.st_mode)) {
		f->max_blocks = st.st_size / bs;
		f->max_size = st.st_size;
		return 0;
	}

	return -1;
}
705
/*
 * Drain completed CQEs from the CQ ring: verify each result, decrement the
 * owning file's pending count, and batch latency samples into the
 * histogram when stats are on. Returns entries reaped, or -1 on IO error.
 */
static int reap_events_uring(struct submitter *s)
{
	struct io_cq_ring *ring = &s->cq_ring;
	struct io_uring_cqe *cqe;
	unsigned head, reaped = 0;
	int last_idx = -1, stat_nr = 0;

	head = *ring->head;
	do {
		struct file *f;

		read_barrier();
		/* ring empty once head catches up with the producer tail */
		if (head == atomic_load_acquire(ring->tail))
			break;
		cqe = &ring->cqes[head & cq_ring_mask];
		if (!do_nop) {
			/* low 32 bits of user_data hold the file index */
			int fileno = cqe->user_data & 0xffffffff;

			f = &s->files[fileno];
			f->pending_ios--;
			if (cqe->res != bs) {
				printf("io: unexpected ret=%d\n", cqe->res);
				if (polled && cqe->res == -EOPNOTSUPP)
					printf("Your filesystem/driver/kernel doesn't support polled IO\n");
				return -1;
			}
		}
		if (stats) {
			/* high 32 bits of user_data: clock batch index */
			int clock_index = cqe->user_data >> 32;

			/* coalesce consecutive completions with same stamp */
			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		head++;
	} while (1);

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	if (reaped) {
		s->inflight -= reaped;
		atomic_store_release(ring->head, head);
	}
	return reaped;
}
758
7d04588a
AG
/*
 * Passthrough variant of reap_events_uring(): CQEs are 32 bytes
 * (IORING_SETUP_CQE32), hence 'index << 1', and a passthrough command
 * reports success as res == 0 rather than the byte count.
 */
static int reap_events_uring_pt(struct submitter *s)
{
	struct io_cq_ring *ring = &s->cq_ring;
	struct io_uring_cqe *cqe;
	unsigned head, reaped = 0;
	int last_idx = -1, stat_nr = 0;
	unsigned index;
	int fileno;

	head = *ring->head;
	do {
		struct file *f;

		read_barrier();
		/* ring empty once head catches up with the producer tail */
		if (head == atomic_load_acquire(ring->tail))
			break;
		index = head & cq_ring_mask;
		cqe = &ring->cqes[index << 1];
		fileno = cqe->user_data & 0xffffffff;
		f = &s->files[fileno];
		f->pending_ios--;

		if (cqe->res != 0) {
			printf("io: unexpected ret=%d\n", cqe->res);
			if (polled && cqe->res == -EINVAL)
				printf("passthrough doesn't support polled IO\n");
			return -1;
		}
		if (stats) {
			/* high 32 bits of user_data: clock batch index */
			int clock_index = cqe->user_data >> 32;

			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		head++;
	} while (1);

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	if (reaped) {
		s->inflight -= reaped;
		atomic_store_release(ring->head, head);
	}
	return reaped;
}
812
4b9e13dc
JA
/*
 * Prefer memory allocation from, and pin this thread's CPU affinity to,
 * the submitter's NUMA node. No-op without libnuma or when no node was
 * detected (numa_node == -1).
 */
static void set_affinity(struct submitter *s)
{
#ifdef CONFIG_LIBNUMA
	struct bitmask *mask;

	if (s->numa_node == -1)
		return;

	numa_set_preferred(s->numa_node);

	mask = numa_allocate_cpumask();
	numa_node_to_cpus(s->numa_node, mask);
	numa_sched_setaffinity(s->tid, mask);
#endif
}
828
829static int detect_node(struct submitter *s, const char *name)
830{
831#ifdef CONFIG_LIBNUMA
832 const char *base = basename(name);
833 char str[128];
834 int ret, fd, node;
835
836 sprintf(str, "/sys/block/%s/device/numa_node", base);
837 fd = open(str, O_RDONLY);
838 if (fd < 0)
839 return -1;
840
841 ret = read(fd, str, sizeof(str));
842 if (ret < 0) {
843 close(fd);
844 return -1;
845 }
846 node = atoi(str);
847 s->numa_node = node;
848 close(fd);
849#else
850 s->numa_node = -1;
851#endif
852 return 0;
853}
854
/*
 * Initialize the libaio context, first disabling any requested options
 * that libaio cannot honor. Returns io_queue_init()'s result, or -1 with
 * errno = EINVAL when built without libaio.
 */
static int setup_aio(struct submitter *s)
{
#ifdef CONFIG_LIBAIO
	if (polled) {
		fprintf(stderr, "aio does not support polled IO\n");
		polled = 0;
	}
	if (sq_thread_poll) {
		fprintf(stderr, "aio does not support SQPOLL IO\n");
		sq_thread_poll = 0;
	}
	if (do_nop) {
		/* fix: message was a copy-paste of the polled-IO warning */
		fprintf(stderr, "aio does not support nop IO\n");
		do_nop = 0;
	}
	if (fixedbufs || register_files) {
		fprintf(stderr, "aio does not support registered files or buffers\n");
		fixedbufs = register_files = 0;
	}

	return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
#else
	fprintf(stderr, "Legacy AIO not available on this system/build\n");
	errno = EINVAL;
	return -1;
#endif
}
882
883static int setup_ring(struct submitter *s)
884{
885 struct io_sq_ring *sring = &s->sq_ring;
886 struct io_cq_ring *cring = &s->cq_ring;
887 struct io_uring_params p;
888 int ret, fd;
889 void *ptr;
7d04588a 890 size_t len;
4b9e13dc
JA
891
892 memset(&p, 0, sizeof(p));
893
894 if (polled && !do_nop)
895 p.flags |= IORING_SETUP_IOPOLL;
896 if (sq_thread_poll) {
897 p.flags |= IORING_SETUP_SQPOLL;
898 if (sq_thread_cpu != -1) {
899 p.flags |= IORING_SETUP_SQ_AFF;
900 p.sq_thread_cpu = sq_thread_cpu;
901 }
902 }
7d04588a
AG
903 if (pt) {
904 p.flags |= IORING_SETUP_SQE128;
905 p.flags |= IORING_SETUP_CQE32;
906 }
4b9e13dc
JA
907
908 fd = io_uring_setup(depth, &p);
909 if (fd < 0) {
910 perror("io_uring_setup");
911 return 1;
912 }
913 s->ring_fd = s->enter_ring_fd = fd;
914
915 io_uring_probe(fd);
916
917 if (fixedbufs) {
918 struct rlimit rlim;
919
920 rlim.rlim_cur = RLIM_INFINITY;
921 rlim.rlim_max = RLIM_INFINITY;
922 /* ignore potential error, not needed on newer kernels */
923 setrlimit(RLIMIT_MEMLOCK, &rlim);
924
925 ret = io_uring_register_buffers(s);
926 if (ret < 0) {
927 perror("io_uring_register_buffers");
928 return 1;
929 }
930
931 if (dma_map) {
932 ret = io_uring_map_buffers(s);
933 if (ret < 0) {
934 perror("io_uring_map_buffers");
935 return 1;
936 }
937 }
938 }
939
940 if (register_files) {
941 ret = io_uring_register_files(s);
942 if (ret < 0) {
943 perror("io_uring_register_files");
944 return 1;
945 }
946 }
947
948 ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
949 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
950 IORING_OFF_SQ_RING);
951 sring->head = ptr + p.sq_off.head;
952 sring->tail = ptr + p.sq_off.tail;
953 sring->ring_mask = ptr + p.sq_off.ring_mask;
954 sring->ring_entries = ptr + p.sq_off.ring_entries;
955 sring->flags = ptr + p.sq_off.flags;
956 sring->array = ptr + p.sq_off.array;
957 sq_ring_mask = *sring->ring_mask;
958
7d04588a
AG
959 if (p.flags & IORING_SETUP_SQE128)
960 len = 2 * p.sq_entries * sizeof(struct io_uring_sqe);
961 else
962 len = p.sq_entries * sizeof(struct io_uring_sqe);
963 s->sqes = mmap(0, len,
4b9e13dc
JA
964 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
965 IORING_OFF_SQES);
966
7d04588a
AG
967 if (p.flags & IORING_SETUP_CQE32) {
968 len = p.cq_off.cqes +
969 2 * p.cq_entries * sizeof(struct io_uring_cqe);
970 } else {
971 len = p.cq_off.cqes +
972 p.cq_entries * sizeof(struct io_uring_cqe);
973 }
974 ptr = mmap(0, len,
4b9e13dc
JA
975 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
976 IORING_OFF_CQ_RING);
977 cring->head = ptr + p.cq_off.head;
978 cring->tail = ptr + p.cq_off.tail;
979 cring->ring_mask = ptr + p.cq_off.ring_mask;
980 cring->ring_entries = ptr + p.cq_off.ring_entries;
981 cring->cqes = ptr + p.cq_off.cqes;
982 cq_ring_mask = *cring->ring_mask;
983 return 0;
984}
985
986static void *allocate_mem(struct submitter *s, int size)
987{
988 void *buf;
989
990#ifdef CONFIG_LIBNUMA
991 if (s->numa_node != -1)
992 return numa_alloc_onnode(size, s->numa_node);
993#endif
994
c409e4c2 995 if (posix_memalign(&buf, t_io_uring_page_size, bs)) {
4b9e13dc
JA
996 printf("failed alloc\n");
997 return NULL;
998 }
999
1000 return buf;
1001}
1002
/*
 * Per-thread setup: NUMA affinity, RNG seeding, IO buffer allocation, ring
 * (or aio) setup, stats allocation and passthrough SQE pre-init. Returns
 * the clock batch count for stats, or 1 on failure (callers treat any
 * small value as usable; errors abort before use).
 */
static int submitter_init(struct submitter *s)
{
	int i, nr_batch, err;
	static int init_printed;
	char buf[80];
	s->tid = gettid();
	printf("submitter=%d, tid=%d, file=%s, node=%d\n", s->index, s->tid,
							s->filename, s->numa_node);

	set_affinity(s);

	__init_rand64(&s->rand_state, s->tid);
	srand48(s->tid);

	for (i = 0; i < MAX_FDS; i++)
		s->files[i].fileno = i;

	/* one buffer per queue slot (depth rounded to a power of two) */
	for (i = 0; i < roundup_pow2(depth); i++) {
		void *buf;

		buf = allocate_mem(s, bs);
		if (!buf)
			return 1;
		s->iovecs[i].iov_base = buf;
		s->iovecs[i].iov_len = bs;
	}

	if (use_sync) {
		sprintf(buf, "Engine=preadv2\n");
		err = 0;
	} else if (!aio) {
		err = setup_ring(s);
		/*
		 * NOTE(review): this dereferences the ring_entries pointers
		 * even when setup_ring() failed - confirm they are valid on
		 * the error path, before the err check below.
		 */
		sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
	} else {
		sprintf(buf, "Engine=aio\n");
		err = setup_aio(s);
	}
	if (err) {
		printf("queue setup failed: %s, %d\n", strerror(errno), err);
		return 1;
	}

	/* print the configuration banner once, from the first thread */
	if (!init_printed) {
		printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
		printf("%s", buf);
		init_printed = 1;
	}

	if (stats) {
		nr_batch = roundup_pow2(depth / batch_submit);
		if (nr_batch < 2)
			nr_batch = 2;
		s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
		/* index 0 is reserved as "no timestamp" */
		s->clock_index = 1;

		s->plat = calloc(PLAT_NR, sizeof(unsigned long));
	} else {
		s->clock_batch = NULL;
		s->plat = NULL;
		nr_batch = 0;
	}
	/* perform the expensive command initialization part for passthrough here
	 * rather than in the fast path
	 */
	if (pt) {
		for (i = 0; i < roundup_pow2(depth); i++) {
			struct io_uring_sqe *sqe = &s->sqes[i << 1];

			memset(&sqe->cmd, 0, sizeof(struct nvme_uring_cmd));
		}
	}
	return nr_batch;
}
1076
1077#ifdef CONFIG_LIBAIO
/*
 * Prepare up to max_ios libaio read iocbs, mirroring init_io()'s file
 * selection and offset logic. Returns the number prepared.
 */
static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
{
	uint64_t data;
	long long offset;
	struct file *f;
	unsigned index;
	long r;

	index = 0;
	while (index < max_ios) {
		struct iocb *iocb = &iocbs[index];

		/* round-robin to the next file once this one has its share */
		if (s->nr_files == 1) {
			f = &s->files[0];
		} else {
			f = &s->files[s->cur_file];
			if (f->pending_ios >= file_depth(s)) {
				s->cur_file++;
				if (s->cur_file == s->nr_files)
					s->cur_file = 0;
				f = &s->files[s->cur_file];
			}
		}
		f->pending_ios++;

		r = lrand48();
		offset = (r % (f->max_blocks - 1)) * bs;
		io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base,
				s->iovecs[index].iov_len, offset);

		/* low 32 bits: file index; high 32 bits: clock batch index */
		data = f->fileno;
		if (stats && stats_running)
			data |= (((uint64_t) s->clock_index) << 32);
		iocb->data = (void *) (uintptr_t) data;
		index++;
	}
	return index;
}
1116
/*
 * Process 'evs' completed libaio events: verify each result, decrement the
 * owning file's pending count, batch latency stats. Returns events reaped
 * or -1 on an unexpected IO result.
 */
static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
{
	int last_idx = -1, stat_nr = 0;
	int reaped = 0;

	while (evs) {
		/* data packs file index (low 32) and clock index (high 32) */
		uint64_t data = (uintptr_t) events[reaped].data;
		struct file *f = &s->files[data & 0xffffffff];

		f->pending_ios--;
		if (events[reaped].res != bs) {
			printf("io: unexpected ret=%ld\n", events[reaped].res);
			return -1;
		}
		if (stats) {
			int clock_index = data >> 32;

			/* coalesce consecutive completions with same stamp */
			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		evs--;
	}

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	s->inflight -= reaped;
	s->done += reaped;
	return reaped;
}
1154
/*
 * libaio submitter thread main loop: prep a batch, io_submit() it, then
 * io_getevents() until the batch target is met; repeat until told to stop.
 */
static void *submitter_aio_fn(void *data)
{
	struct submitter *s = data;
	int i, ret, prepped;
	struct iocb **iocbsptr;
	struct iocb *iocbs;
	struct io_event *events;
#ifdef ARCH_HAVE_CPU_CLOCK
	int nr_batch = submitter_init(s);
#else
	submitter_init(s);
#endif

	/* NOTE(review): these callocs are unchecked - OOM would crash below */
	iocbsptr = calloc(depth, sizeof(struct iocb *));
	iocbs = calloc(depth, sizeof(struct iocb));
	events = calloc(depth, sizeof(struct io_event));

	for (i = 0; i < depth; i++)
		iocbsptr[i] = &iocbs[i];

	prepped = 0;
	do {
		int to_wait, to_submit, to_prep;

		if (!prepped && s->inflight < depth) {
			to_prep = min(depth - s->inflight, batch_submit);
			prepped = prep_more_ios_aio(s, to_prep, iocbs);
#ifdef ARCH_HAVE_CPU_CLOCK
			/* stamp this batch's submit time for latency stats */
			if (prepped && stats) {
				s->clock_batch[s->clock_index] = get_cpu_clock();
				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
			}
#endif
		}
		s->inflight += prepped;
		to_submit = prepped;

		/* only wait for completions when the queue would overflow */
		if (to_submit && (s->inflight + to_submit <= depth))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, batch_complete);

		ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
		s->calls++;
		if (ret < 0) {
			perror("io_submit");
			break;
		} else if (ret != to_submit) {
			printf("submitted %d, wanted %d\n", ret, to_submit);
			break;
		}
		prepped = 0;

		while (to_wait) {
			int r;

			s->calls++;
			r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL);
			if (r < 0) {
				perror("io_getevents");
				break;
			} else if (r != to_wait) {
				printf("r=%d, wait=%d\n", r, to_wait);
				break;
			}
			r = reap_events_aio(s, events, r);
			s->reaps += r;
			to_wait -= r;
		}
	} while (!s->finish);

	free(iocbsptr);
	free(iocbs);
	free(events);
	finish = 1;
	return NULL;
}
1232#endif
1233
ca8c91c5
JA
1234static void io_uring_unregister_ring(struct submitter *s)
1235{
1236 struct io_uring_rsrc_update up = {
1237 .offset = s->enter_ring_fd,
1238 };
1239
1240 syscall(__NR_io_uring_register, s->ring_fd, IORING_UNREGISTER_RING_FDS,
1241 &up, 1);
1242}
1243
1244static int io_uring_register_ring(struct submitter *s)
1245{
1246 struct io_uring_rsrc_update up = {
1247 .data = s->ring_fd,
1248 .offset = -1U,
1249 };
1250 int ret;
1251
1252 ret = syscall(__NR_io_uring_register, s->ring_fd,
1253 IORING_REGISTER_RING_FDS, &up, 1);
1254 if (ret == 1) {
1255 s->enter_ring_fd = up.offset;
1256 return 0;
1257 }
1258 register_ring = 0;
1259 return -1;
1260}
1261
256714ea
JA
/*
 * Submission thread for the io_uring engine: prepare SQEs in batches,
 * submit them via io_uring_enter() (or let the SQPOLL kernel thread pick
 * them up), and reap completions until the run is finished. The
 * submit/submit_more labels implement retry on short or zero-progress
 * submits without re-preparing SQEs.
 */
static void *submitter_uring_fn(void *data)
{
	struct submitter *s = data;
	struct io_sq_ring *ring = &s->sq_ring;
	int ret, prepped;
#ifdef ARCH_HAVE_CPU_CLOCK
	int nr_batch = submitter_init(s);
#else
	submitter_init(s);
#endif

	if (register_ring)
		io_uring_register_ring(s);

	prepped = 0;
	do {
		int to_wait, to_submit, this_reap, to_prep;
		unsigned ring_flags = 0;

		/* top up the prepared batch if we have queue depth to spare */
		if (!prepped && s->inflight < depth) {
			to_prep = min(depth - s->inflight, batch_submit);
			prepped = prep_more_ios_uring(s, to_prep);
#ifdef ARCH_HAVE_CPU_CLOCK
			if (prepped && stats) {
				s->clock_batch[s->clock_index] = get_cpu_clock();
				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
			}
#endif
		}
		s->inflight += prepped;
submit_more:
		to_submit = prepped;
submit:
		/* only wait for completions once the ring would overfill */
		if (to_submit && (s->inflight + to_submit <= depth))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, batch_complete);

		/*
		 * Only need to call io_uring_enter if we're not using SQ thread
		 * poll, or if IORING_SQ_NEED_WAKEUP is set.
		 */
		if (sq_thread_poll)
			ring_flags = atomic_load_acquire(ring->flags);
		if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) {
			unsigned flags = 0;

			if (to_wait)
				flags = IORING_ENTER_GETEVENTS;
			if (ring_flags & IORING_SQ_NEED_WAKEUP)
				flags |= IORING_ENTER_SQ_WAKEUP;
			ret = io_uring_enter(s, to_submit, to_wait, flags);
			s->calls++;
		} else {
			/* for SQPOLL, we submitted it all effectively */
			ret = to_submit;
		}

		/*
		 * For non SQ thread poll, we already got the events we needed
		 * through the io_uring_enter() above. For SQ thread poll, we
		 * need to loop here until we find enough events.
		 */
		this_reap = 0;
		do {
			int r;

			if (pt)
				r = reap_events_uring_pt(s);
			else
				r = reap_events_uring(s);
			if (r == -1) {
				s->finish = 1;
				break;
			} else if (r > 0)
				this_reap += r;
		} while (sq_thread_poll && this_reap < to_wait);
		s->reaps += this_reap;

		if (ret >= 0) {
			if (!ret) {
				/* nothing submitted; re-wait if IOs remain inflight */
				to_submit = 0;
				if (s->inflight)
					goto submit;
				continue;
			} else if (ret < to_submit) {
				/* short submit: account progress, retry remainder */
				int diff = to_submit - ret;

				s->done += ret;
				prepped -= diff;
				goto submit_more;
			}
			s->done += ret;
			prepped = 0;
			continue;
		} else if (ret < 0) {
			if (errno == EAGAIN) {
				if (s->finish)
					break;
				if (this_reap)
					goto submit;
				/* kernel busy: wait for completions, submit nothing */
				to_submit = 0;
				goto submit;
			}
			printf("io_submit: %s\n", strerror(errno));
			break;
		}
	} while (!s->finish);

	if (register_ring)
		io_uring_unregister_ring(s);

	finish = 1;
	return NULL;
}
1377
a7648136 1378#ifdef CONFIG_PWRITEV2
379406bc
JA
1379static void *submitter_sync_fn(void *data)
1380{
1381 struct submitter *s = data;
1382 int ret;
1383
1384 submitter_init(s);
1385
1386 do {
1387 uint64_t offset;
1388 struct file *f;
1389 long r;
1390
1391 if (s->nr_files == 1) {
1392 f = &s->files[0];
1393 } else {
1394 f = &s->files[s->cur_file];
1395 if (f->pending_ios >= file_depth(s)) {
1396 s->cur_file++;
1397 if (s->cur_file == s->nr_files)
1398 s->cur_file = 0;
1399 f = &s->files[s->cur_file];
1400 }
1401 }
1402 f->pending_ios++;
1403
1404 if (random_io) {
1405 r = __rand64(&s->rand_state);
1406 offset = (r % (f->max_blocks - 1)) * bs;
1407 } else {
1408 offset = f->cur_off;
1409 f->cur_off += bs;
1410 if (f->cur_off + bs > f->max_size)
1411 f->cur_off = 0;
1412 }
1413
1414#ifdef ARCH_HAVE_CPU_CLOCK
1415 if (stats)
1416 s->clock_batch[s->clock_index] = get_cpu_clock();
1417#endif
1418
1419 s->inflight++;
1420 s->calls++;
1421
1422 if (polled)
1423 ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, RWF_HIPRI);
1424 else
1425 ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, 0);
1426
1427 if (ret < 0) {
1428 perror("preadv2");
1429 break;
1430 } else if (ret != bs) {
1431 break;
1432 }
1433
1434 s->done++;
1435 s->inflight--;
1436 f->pending_ios--;
1437 if (stats)
1438 add_stat(s, s->clock_index, 1);
1439 } while (!s->finish);
1440
1441 finish = 1;
1442 return NULL;
1443}
a7648136
JA
1444#else
1445static void *submitter_sync_fn(void *data)
1446{
1447 finish = 1;
1448 return NULL;
1449}
1450#endif
379406bc 1451
54319661
JA
1452static struct submitter *get_submitter(int offset)
1453{
1454 void *ret;
1455
1456 ret = submitter;
1457 if (offset)
1458 ret += offset * (sizeof(*submitter) + depth * sizeof(struct iovec));
1459 return ret;
1460}
1461
65e1a5e8 1462static void do_finish(const char *reason)
c9fb4c5b 1463{
54319661 1464 int j;
4b9e13dc 1465
65e1a5e8 1466 printf("Exiting on %s\n", reason);
54319661
JA
1467 for (j = 0; j < nthreads; j++) {
1468 struct submitter *s = get_submitter(j);
1469 s->finish = 1;
1470 }
4b9e13dc
JA
1471 if (max_iops > 1000000) {
1472 double miops = (double) max_iops / 1000000.0;
1473 printf("Maximum IOPS=%.2fM\n", miops);
1474 } else if (max_iops > 100000) {
1475 double kiops = (double) max_iops / 1000.0;
1476 printf("Maximum IOPS=%.2fK\n", kiops);
1477 } else {
18b557a0 1478 printf("Maximum IOPS=%lu\n", max_iops);
4b9e13dc 1479 }
c9fb4c5b
JA
1480 finish = 1;
1481}
1482
65e1a5e8
EV
/* SIGINT (and, on Windows, SIGBREAK) handler: request a clean shutdown. */
static void sig_int(int sig)
{
	do_finish("signal");
}
1487
c9fb4c5b
JA
1488static void arm_sig_int(void)
1489{
1490 struct sigaction act;
1491
1492 memset(&act, 0, sizeof(act));
1493 act.sa_handler = sig_int;
1494 act.sa_flags = SA_RESTART;
1495 sigaction(SIGINT, &act, NULL);
2cf71009
BP
1496
1497 /* Windows uses SIGBREAK as a quit signal from other applications */
1498#ifdef WIN32
1499 sigaction(SIGBREAK, &act, NULL);
1500#endif
c9fb4c5b
JA
1501}
1502
/*
 * Print the option summary with each option's current default and exit
 * with @status (0 for -h/-?, non-zero for usage errors).
 */
static void usage(char *argv, int status)
{
	char runtime_str[16];
	snprintf(runtime_str, sizeof(runtime_str), "%d", runtime);
	printf("%s [options] -- [filenames]\n"
		" -d <int> : IO Depth, default %d\n"
		" -s <int> : Batch submit, default %d\n"
		" -c <int> : Batch complete, default %d\n"
		" -b <int> : Block size, default %d\n"
		" -p <bool> : Polled IO, default %d\n"
		" -B <bool> : Fixed buffers, default %d\n"
		" -D <bool> : DMA map fixed buffers, default %d\n"
		" -F <bool> : Register files, default %d\n"
		" -n <int> : Number of threads, default %d\n"
		" -O <bool> : Use O_DIRECT, default %d\n"
		" -N <bool> : Perform just no-op requests, default %d\n"
		" -t <bool> : Track IO latencies, default %d\n"
		" -T <int> : TSC rate in HZ\n"
		" -r <int> : Runtime in seconds, default %s\n"
		" -R <bool> : Use random IO, default %d\n"
		" -a <bool> : Use legacy aio, default %d\n"
		" -S <bool> : Use sync IO (preadv2), default %d\n"
		" -X <bool> : Use registered ring %d\n"
		" -P <bool> : Automatically place on device home node %d\n"
		" -u <bool> : Use nvme-passthrough I/O, default %d\n",
		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio,
		use_sync, register_ring, numa_placement, pt);
	exit(status);
}
1534
203e4c26
JA
1535static void read_tsc_rate(void)
1536{
1537 char buffer[32];
1538 int fd, ret;
1539
1540 if (tsc_rate)
1541 return;
1542
1543 fd = open(TSC_RATE_FILE, O_RDONLY);
1544 if (fd < 0)
1545 return;
1546
1547 ret = read(fd, buffer, sizeof(buffer));
1548 if (ret < 0) {
1549 close(fd);
1550 return;
1551 }
1552
1553 tsc_rate = strtoul(buffer, NULL, 10);
1554 printf("Using TSC rate %luHz\n", tsc_rate);
1555 close(fd);
1556}
1557
1558static void write_tsc_rate(void)
1559{
1560 char buffer[32];
1561 struct stat sb;
1562 int fd, ret;
1563
1564 if (!stat(TSC_RATE_FILE, &sb))
1565 return;
1566
1567 fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644);
1568 if (fd < 0)
1569 return;
1570
1571 memset(buffer, 0, sizeof(buffer));
1572 sprintf(buffer, "%lu", tsc_rate);
1573 ret = write(fd, buffer, strlen(buffer));
1574 if (ret < 0)
1575 perror("write");
1576 close(fd);
1577}
1578
c9fb4c5b
JA
/*
 * Entry point: parse options, distribute the given files over the worker
 * threads, start one submitter thread per worker, then print per-second
 * throughput until interrupted or the runtime expires, and finally join
 * the workers and emit latency percentiles if stats were enabled.
 */
int main(int argc, char *argv[])
{
	struct submitter *s;
	unsigned long done, calls, reap;
	int i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
	struct file f;
	void *ret;

	if (!do_nop && argc < 2)
		usage(argv[0], 1);

	/* option parsing: each flag writes a file-scope configuration global */
	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:u:h?")) != -1) {
		switch (opt) {
		case 'a':
			aio = !!atoi(optarg);
			break;
		case 'd':
			depth = atoi(optarg);
			break;
		case 's':
			batch_submit = atoi(optarg);
			if (!batch_submit)
				batch_submit = 1;
			break;
		case 'c':
			batch_complete = atoi(optarg);
			if (!batch_complete)
				batch_complete = 1;
			break;
		case 'b':
			bs = atoi(optarg);
			break;
		case 'p':
			polled = !!atoi(optarg);
			break;
		case 'B':
			fixedbufs = !!atoi(optarg);
			break;
		case 'F':
			register_files = !!atoi(optarg);
			break;
		case 'n':
			nthreads = atoi(optarg);
			if (!nthreads) {
				printf("Threads must be non-zero\n");
				usage(argv[0], 1);
			}
			break;
		case 'N':
			do_nop = !!atoi(optarg);
			break;
		case 'O':
			buffered = !atoi(optarg);
			break;
		case 't':
#ifndef ARCH_HAVE_CPU_CLOCK
			fprintf(stderr, "Stats not supported on this CPU\n");
			return 1;
#endif
			stats = !!atoi(optarg);
			break;
		case 'T':
#ifndef ARCH_HAVE_CPU_CLOCK
			fprintf(stderr, "Stats not supported on this CPU\n");
			return 1;
#endif
			tsc_rate = strtoul(optarg, NULL, 10);
			write_tsc_rate();
			break;
		case 'r':
			runtime = atoi(optarg);
			break;
		case 'D':
			dma_map = !!atoi(optarg);
			break;
		case 'R':
			random_io = !!atoi(optarg);
			break;
		case 'X':
			register_ring = !!atoi(optarg);
			break;
		case 'S':
#ifdef CONFIG_PWRITEV2
			use_sync = !!atoi(optarg);
#else
			fprintf(stderr, "preadv2 not supported\n");
			exit(1);
#endif
			break;
		case 'P':
			numa_placement = !!atoi(optarg);
			break;
		case 'u':
			pt = !!atoi(optarg);
			break;
		case 'h':
		case '?':
		default:
			usage(argv[0], 0);
			break;
		}
	}

	if (stats)
		read_tsc_rate();

	/* clamp batch sizes to the queue depth; DMA map needs fixed buffers */
	if (batch_complete > depth)
		batch_complete = depth;
	if (batch_submit > depth)
		batch_submit = depth;
	if (!fixedbufs && dma_map)
		dma_map = 0;

	/*
	 * One contiguous allocation: each submitter is followed by its own
	 * iovec array (see get_submitter() for the layout math).
	 */
	submitter = calloc(nthreads, sizeof(*submitter) +
				roundup_pow2(depth) * sizeof(struct iovec));
	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		s->numa_node = -1;
		s->index = j;
		s->done = s->calls = s->reaps = 0;
	}

	flags = O_RDONLY | O_NOATIME;
	if (!buffered)
		flags |= O_DIRECT;

	j = 0;
	i = optind;
	nfiles = argc - i;
	if (!do_nop) {
		if (!nfiles) {
			printf("No files specified\n");
			usage(argv[0], 1);
		}
		threads_per_f = nthreads / nfiles;
		/* make sure each thread gets assigned files */
		if (threads_per_f == 0) {
			threads_per_f = 1;
		} else {
			threads_rem = nthreads - threads_per_f * nfiles;
		}
	}
	/* open each file and hand it to threads_per_f (+1 for remainder) threads */
	while (!do_nop && i < argc) {
		int k, limit;

		memset(&f, 0, sizeof(f));

		fd = open(argv[i], flags);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		f.real_fd = fd;
		if (get_file_size(&f)) {
			printf("failed getting size of device/file\n");
			return 1;
		}
		if (f.max_blocks <= 1) {
			printf("Zero file/device size?\n");
			return 1;
		}
		f.max_blocks--;

		limit = threads_per_f;
		limit += threads_rem > 0 ? 1 : 0;
		for (k = 0; k < limit; k++) {
			s = get_submitter((j + k) % nthreads);

			if (s->nr_files == MAX_FDS) {
				printf("Max number of files (%d) reached\n", MAX_FDS);
				break;
			}

			memcpy(&s->files[s->nr_files], &f, sizeof(f));

			if (numa_placement)
				detect_node(s, argv[i]);

			s->filename = argv[i];
			s->nr_files++;
		}
		threads_rem--;
		i++;
		j += limit;
	}

	arm_sig_int();

	t_io_uring_page_size = sysconf(_SC_PAGESIZE);
	if (t_io_uring_page_size < 0)
		t_io_uring_page_size = 4096;

	/* launch one submitter thread per worker, engine chosen by flags */
	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		if (use_sync)
			pthread_create(&s->thread, NULL, submitter_sync_fn, s);
		else if (!aio)
			pthread_create(&s->thread, NULL, submitter_uring_fn, s);
#ifdef CONFIG_LIBAIO
		else
			pthread_create(&s->thread, NULL, submitter_aio_fn, s);
#endif
	}

	/* main reporting loop: once per second, print deltas since last tick */
	reap = calls = done = 0;
	do {
		unsigned long this_done = 0;
		unsigned long this_reap = 0;
		unsigned long this_call = 0;
		unsigned long rpc = 0, ipc = 0;
		unsigned long iops, bw;

		sleep(1);
		if (runtime && !--runtime)
			do_finish("timeout");

		/* don't print partial run, if interrupted by signal */
		if (finish)
			break;

		/* one second in to the run, enable stats */
		if (stats)
			stats_running = 1;

		for (j = 0; j < nthreads; j++) {
			s = get_submitter(j);
			this_done += s->done;
			this_call += s->calls;
			this_reap += s->reaps;
		}
		/* IOs submitted / reaped per syscall; -1 if no calls this tick */
		if (this_call - calls) {
			rpc = (this_done - done) / (this_call - calls);
			ipc = (this_reap - reap) / (this_call - calls);
		} else
			rpc = ipc = -1;
		iops = this_done - done;
		/* bandwidth in MiB/s, avoiding overflow for bs > 1 MiB */
		if (bs > 1048576)
			bw = iops * (bs / 1048576);
		else
			bw = iops / (1048576 / bs);
		if (iops > 1000000) {
			double miops = (double) iops / 1000000.0;
			printf("IOPS=%.2fM, ", miops);
		} else if (iops > 100000) {
			double kiops = (double) iops / 1000.0;
			printf("IOPS=%.2fK, ", kiops);
		} else {
			printf("IOPS=%lu, ", iops);
		}
		max_iops = max(max_iops, iops);
		if (!do_nop) {
			if (bw > 2000) {
				double bw_g = (double) bw / 1000.0;

				printf("BW=%.2fGiB/s, ", bw_g);
			} else {
				printf("BW=%luMiB/s, ", bw);
			}
		}
		printf("IOS/call=%ld/%ld\n", rpc, ipc);
		done = this_done;
		calls = this_call;
		reap = this_reap;
	} while (!finish);

	/* join workers, then dump latency percentiles per thread if enabled */
	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		pthread_join(s->thread, &ret);
		close(s->ring_fd);

		if (stats) {
			unsigned long nr;

			printf("%d: Latency percentiles:\n", s->tid);
			for (i = 0, nr = 0; i < PLAT_NR; i++)
				nr += s->plat[i];
			show_clat_percentiles(s->plat, nr, 4);
			free(s->clock_batch);
			free(s->plat);
		}
	}

	free(submitter);
	return 0;
}