+ f->max_blocks = bytes / bs;
+ f->max_size = bytes;
+ return 0;
+ } else if (S_ISREG(st.st_mode)) {
+ f->max_blocks = st.st_size / bs;
+ f->max_size = st.st_size;
+ return 0;
+ }
+
+ return -1;
+}
+
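+/*
+ * Reap completed io_uring CQEs: walk the CQ ring from head to tail,
+ * verify each completion transferred a full block (bs), fold the
+ * completions into the per-batch latency stats, then publish the new
+ * head with a release store so the kernel can reuse the entries.
+ */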
+static int reap_events_uring(struct submitter *s)
+{
+ struct io_cq_ring *ring = &s->cq_ring;
+ struct io_uring_cqe *cqe;
+ unsigned head, reaped = 0;
+ int last_idx = -1, stat_nr = 0;
+
+ head = *ring->head;
+ do {
+ struct file *f;
+
+ if (head == atomic_load_acquire(ring->tail))
+ break;
+ cqe = &ring->cqes[head & cq_ring_mask];
+ if (!do_nop) {
+ int fileno = cqe->user_data & 0xffffffff;
+
+ f = &s->files[fileno];
+ f->pending_ios--;
+ if (cqe->res != bs) {
+ printf("io: unexpected ret=%d\n", cqe->res);
+ if (polled && cqe->res == -EOPNOTSUPP)
+ printf("Your filesystem/driver/kernel doesn't support polled IO\n");
+ return -1;
+ }
+ }
+ if (stats) {
+ int clock_index = cqe->user_data >> 32;
+
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
+ }
+ stat_nr++;
+ }
+ reaped++;
+ head++;
+ } while (1);
+
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ if (reaped) {
+ s->inflight -= reaped;
+ atomic_store_release(ring->head, head);
+ }
+ return reaped;
+}
+
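+/*
+ * Passthrough (NVMe uring_cmd) variant of CQE reaping. The ring is set
+ * up with IORING_SETUP_CQE32, so each completion occupies two regular
+ * CQE slots (hence the "index << 1"), and success is res == 0 rather
+ * than a byte count.
+ */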
+static int reap_events_uring_pt(struct submitter *s)
+{
+ struct io_cq_ring *ring = &s->cq_ring;
+ struct io_uring_cqe *cqe;
+ unsigned head, reaped = 0;
+ int last_idx = -1, stat_nr = 0;
+ unsigned index;
+ int fileno;
+
+ head = *ring->head;
+ do {
+ struct file *f;
+
+ if (head == atomic_load_acquire(ring->tail))
+ break;
+ index = head & cq_ring_mask;
+ cqe = &ring->cqes[index << 1];
+ fileno = cqe->user_data & 0xffffffff;
+ f = &s->files[fileno];
+ f->pending_ios--;
+
+ if (cqe->res != 0) {
+ printf("io: unexpected ret=%d\n", cqe->res);
+ if (polled && cqe->res == -EINVAL)
+ printf("passthrough doesn't support polled IO\n");
+ return -1;
+ }
+ if (stats) {
+ int clock_index = cqe->user_data >> 32;
+
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
+ }
+ stat_nr++;
+ }
+ reaped++;
+ head++;
+ } while (1);
+
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ if (reaped) {
+ s->inflight -= reaped;
+ atomic_store_release(ring->head, head);
+ }
+ return reaped;
+}
+
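+/*
+ * Pin the submitter to the CPUs of its preferred NUMA node. This is a
+ * no-op when built without libnuma or when no node was detected.
+ */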
+static void set_affinity(struct submitter *s)
+{
+#ifdef CONFIG_LIBNUMA
+ struct bitmask *mask;
+
+ if (s->numa_node == -1)
+ return;
+
+ numa_set_preferred(s->numa_node);
+
+ mask = numa_allocate_cpumask();
+ numa_node_to_cpus(s->numa_node, mask);
+ numa_sched_setaffinity(s->tid, mask);
+#endif
+}
+
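+/*
+ * Look up the device's home NUMA node via sysfs: the nvme-generic
+ * class device for passthrough, the block device otherwise.
+ */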
+static int detect_node(struct submitter *s, const char *name)
+{
+#ifdef CONFIG_LIBNUMA
+ const char *base = basename(name);
+ char str[128];
+ int ret, fd, node;
+
+ if (pt)
+ sprintf(str, "/sys/class/nvme-generic/%s/device/numa_node", base);
+ else
+ sprintf(str, "/sys/block/%s/device/numa_node", base);
+ fd = open(str, O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ ret = read(fd, str, sizeof(str) - 1);
+ if (ret <= 0) {
+ close(fd);
+ return -1;
+ }
+ str[ret] = '\0';
+ node = atoi(str);
+ s->numa_node = node;
+ close(fd);
+#else
+ s->numa_node = -1;
+#endif
+ return 0;
+}
+
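+/*
+ * Legacy libaio setup. Polled IO, SQPOLL, nops and registered
+ * files/buffers are io_uring-only, so they are disabled here with a
+ * warning before the AIO context is created.
+ */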
+static int setup_aio(struct submitter *s)
+{
+#ifdef CONFIG_LIBAIO
+ if (polled) {
+ fprintf(stderr, "aio does not support polled IO\n");
+ polled = 0;
+ }
+ if (sq_thread_poll) {
+ fprintf(stderr, "aio does not support SQPOLL IO\n");
+ sq_thread_poll = 0;
+ }
+ if (do_nop) {
+ fprintf(stderr, "aio does not support polled IO\n");
+ do_nop = 0;
+ }
+ if (fixedbufs || register_files) {
+ fprintf(stderr, "aio does not support registered files or buffers\n");
+ fixedbufs = register_files = 0;
+ }
+
+ return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
+#else
+ fprintf(stderr, "Legacy AIO not available on this system/build\n");
+ errno = EINVAL;
+ return -1;
+#endif
+}
+
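+/*
+ * Create and map the io_uring: io_uring_setup() with the requested
+ * flags, optional buffer/file registration, then mmap of the SQ ring,
+ * the SQE array and the CQ ring. SQE128/CQE32 double the respective
+ * entry sizes for NVMe passthrough.
+ */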
+static int setup_ring(struct submitter *s)
+{
+ struct io_sq_ring *sring = &s->sq_ring;
+ struct io_cq_ring *cring = &s->cq_ring;
+ struct io_uring_params p;
+ int ret, fd, i;
+ void *ptr;
+ size_t len;
+
+ memset(&p, 0, sizeof(p));
+
+ if (polled && !do_nop)
+ p.flags |= IORING_SETUP_IOPOLL;
+ if (sq_thread_poll) {
+ p.flags |= IORING_SETUP_SQPOLL;
+ if (sq_thread_cpu != -1) {
+ p.flags |= IORING_SETUP_SQ_AFF;
+ p.sq_thread_cpu = sq_thread_cpu;
+ }
+ }
+ if (pt) {
+ p.flags |= IORING_SETUP_SQE128;
+ p.flags |= IORING_SETUP_CQE32;
+ }
+
+ fd = io_uring_setup(depth, &p);
+ if (fd < 0) {
+ perror("io_uring_setup");
+ return 1;
+ }
+ s->ring_fd = s->enter_ring_fd = fd;
+
+ io_uring_probe(fd);
+
+ if (fixedbufs) {
+ struct rlimit rlim;
+
+ rlim.rlim_cur = RLIM_INFINITY;
+ rlim.rlim_max = RLIM_INFINITY;
+ /* ignore potential error, not needed on newer kernels */
+ setrlimit(RLIMIT_MEMLOCK, &rlim);
+
+ ret = io_uring_register_buffers(s);
+ if (ret < 0) {
+ perror("io_uring_register_buffers");
+ return 1;
+ }
+
+ if (dma_map) {
+ ret = io_uring_map_buffers(s);
+ if (ret < 0) {
+ perror("io_uring_map_buffers");
+ return 1;
+ }
+ }
+ }
+
+ if (register_files) {
+ ret = io_uring_register_files(s);
+ if (ret < 0) {
+ perror("io_uring_register_files");
+ return 1;
+ }
+ }
+
+ ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+ IORING_OFF_SQ_RING);
+ sring->head = ptr + p.sq_off.head;
+ sring->tail = ptr + p.sq_off.tail;
+ sring->ring_mask = ptr + p.sq_off.ring_mask;
+ sring->ring_entries = ptr + p.sq_off.ring_entries;
+ sring->flags = ptr + p.sq_off.flags;
+ sring->array = ptr + p.sq_off.array;
+ sq_ring_mask = *sring->ring_mask;
+
+ if (p.flags & IORING_SETUP_SQE128)
+ len = 2 * p.sq_entries * sizeof(struct io_uring_sqe);
+ else
+ len = p.sq_entries * sizeof(struct io_uring_sqe);
+ s->sqes = mmap(0, len,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+ IORING_OFF_SQES);
+
+ if (p.flags & IORING_SETUP_CQE32) {
+ len = p.cq_off.cqes +
+ 2 * p.cq_entries * sizeof(struct io_uring_cqe);
+ } else {
+ len = p.cq_off.cqes +
+ p.cq_entries * sizeof(struct io_uring_cqe);
+ }
+ ptr = mmap(0, len,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+ IORING_OFF_CQ_RING);
+ cring->head = ptr + p.cq_off.head;
+ cring->tail = ptr + p.cq_off.tail;
+ cring->ring_mask = ptr + p.cq_off.ring_mask;
+ cring->ring_entries = ptr + p.cq_off.ring_entries;
+ cring->cqes = ptr + p.cq_off.cqes;
+ cq_ring_mask = *cring->ring_mask;
+
+ for (i = 0; i < p.sq_entries; i++)
+ sring->array[i] = i;
+
+ return 0;
+}
+
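+/*
+ * Allocate an IO buffer, preferring memory local to the submitter's
+ * NUMA node when libnuma is available, otherwise falling back to a
+ * page-aligned allocation.
+ */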
+static void *allocate_mem(struct submitter *s, int size)
+{
+ void *buf;
+
+#ifdef CONFIG_LIBNUMA
+ if (s->numa_node != -1)
+ return numa_alloc_onnode(size, s->numa_node);
+#endif
+
+ if (posix_memalign(&buf, t_io_uring_page_size, size)) {
+ printf("failed alloc\n");
+ return NULL;
+ }
+
+ return buf;
+}
+
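+/*
+ * Per-submitter setup: CPU affinity, random state, IO buffers, the
+ * chosen engine (preadv2, io_uring or aio) and the optional latency
+ * stats arrays. Returns the stats clock batch size (0 when stats are
+ * disabled).
+ */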
+static int submitter_init(struct submitter *s)
+{
+ int i, nr_batch, err;
+ static int init_printed;
+ char buf[80];
+
+ s->tid = gettid();
+ printf("submitter=%d, tid=%d, file=%s, node=%d\n", s->index, s->tid,
+ s->filename, s->numa_node);
+
+ set_affinity(s);
+
+ __init_rand64(&s->rand_state, s->tid);
+ srand48(s->tid);
+
+ for (i = 0; i < MAX_FDS; i++)
+ s->files[i].fileno = i;
+
+ for (i = 0; i < roundup_pow2(depth); i++) {
+ void *buf;
+
+ buf = allocate_mem(s, bs);
+ if (!buf)
+ return 1;
+ s->iovecs[i].iov_base = buf;
+ s->iovecs[i].iov_len = bs;
+ }
+
+ if (use_sync) {
+ sprintf(buf, "Engine=preadv2\n");
+ err = 0;
+ } else if (!aio) {
+ err = setup_ring(s);
+ sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+ } else {
+ sprintf(buf, "Engine=aio\n");
+ err = setup_aio(s);
+ }
+ if (err) {
+ printf("queue setup failed: %s, %d\n", strerror(errno), err);
+ return 1;
+ }
+
+ if (!init_printed) {
+ printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
+ printf("%s", buf);
+ init_printed = 1;
+ }
+
+ if (stats) {
+ nr_batch = roundup_pow2(depth / batch_submit);
+ if (nr_batch < 2)
+ nr_batch = 2;
+ s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
+ s->clock_index = 1;
+
+ s->plat = calloc(PLAT_NR, sizeof(unsigned long));
+ } else {
+ s->clock_batch = NULL;
+ s->plat = NULL;
+ nr_batch = 0;
+ }
+ /*
+ * perform the expensive command initialization part for passthrough here
+ * rather than in the fast path
+ */
+ if (pt) {
+ for (i = 0; i < roundup_pow2(depth); i++) {
+ struct io_uring_sqe *sqe = &s->sqes[i << 1];
+
+ memset(&sqe->cmd, 0, sizeof(struct nvme_uring_cmd));
+ }
+ }
+ return nr_batch;
+}
+
+#ifdef CONFIG_LIBAIO
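+/*
+ * Fill up to max_ios iocbs with reads, rotating across files once the
+ * current file has its share of IO in flight; the file number and the
+ * stats clock index are packed into the iocb data field.
+ */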
+static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
+{
+ uint64_t data;
+ struct file *f;
+ unsigned index;
+
+ index = 0;
+ while (index < max_ios) {
+ struct iocb *iocb = &iocbs[index];
+
+ if (s->nr_files == 1) {
+ f = &s->files[0];
+ } else {
+ f = &s->files[s->cur_file];
+ if (f->pending_ios >= file_depth(s)) {
+ s->cur_file++;
+ if (s->cur_file == s->nr_files)
+ s->cur_file = 0;
+ f = &s->files[s->cur_file];
+ }
+ }
+ f->pending_ios++;
+
+ io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base,
+ s->iovecs[index].iov_len, get_offset(s, f));
+
+ data = f->fileno;
+ if (stats && stats_running)
+ data |= (((uint64_t) s->clock_index) << 32);
+ iocb->data = (void *) (uintptr_t) data;
+ index++;
+ }
+ return index;
+}
+
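+/*
+ * Process io_getevents() completions: drop the per-file inflight
+ * count, verify a full block was read and fold the completions into
+ * the latency stats, mirroring reap_events_uring().
+ */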
+static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
+{
+ int last_idx = -1, stat_nr = 0;
+ int reaped = 0;
+
+ while (evs) {
+ uint64_t data = (uintptr_t) events[reaped].data;
+ struct file *f = &s->files[data & 0xffffffff];
+
+ f->pending_ios--;
+ if (events[reaped].res != bs) {
+ printf("io: unexpected ret=%ld\n", events[reaped].res);
+ return -1;
+ }
+ if (stats) {
+ int clock_index = data >> 32;
+
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
+ }
+ stat_nr++;
+ }
+ reaped++;
+ evs--;
+ }
+
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ s->inflight -= reaped;
+ s->done += reaped;
+ return reaped;
+}
+
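+/*
+ * Main loop for the libaio engine: prepare up to batch_submit iocbs,
+ * timestamp the batch when stats are enabled, submit it with
+ * io_submit() and then wait for completions.
+ */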
+static void *submitter_aio_fn(void *data)
+{
+ struct submitter *s = data;
+ int i, ret, prepped;
+ struct iocb **iocbsptr;
+ struct iocb *iocbs;
+ struct io_event *events;
+#ifdef ARCH_HAVE_CPU_CLOCK
+ int nr_batch = submitter_init(s);
+#else
+ submitter_init(s);
+#endif
+
+ iocbsptr = calloc(depth, sizeof(struct iocb *));
+ iocbs = calloc(depth, sizeof(struct iocb));
+ events = calloc(depth, sizeof(struct io_event));
+
+ for (i = 0; i < depth; i++)
+ iocbsptr[i] = &iocbs[i];
+
+ prepped = 0;
+ do {
+ int to_wait, to_submit, to_prep;
+
+ if (!prepped && s->inflight < depth) {
+ to_prep = min(depth - s->inflight, batch_submit);
+ prepped = prep_more_ios_aio(s, to_prep, iocbs);
+#ifdef ARCH_HAVE_CPU_CLOCK
+ if (prepped && stats) {
+ s->clock_batch[s->clock_index] = get_cpu_clock();
+ s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+ }
+#endif
+ }
+ s->inflight += prepped;
+ to_submit = prepped;
+
+ if (to_submit && (s->inflight + to_submit <= depth))
+ to_wait = 0;
+ else
+ to_wait = min(s->inflight + to_submit, batch_complete);
+
+ ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
+ s->calls++;
+ if (ret < 0) {
+ perror("io_submit");
+ break;
+ } else if (ret != to_submit) {
+ printf("submitted %d, wanted %d\n", ret, to_submit);
+ break;
+ }
+ prepped = 0;
+
+ while (to_wait) {
+ int r;