+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ if (reaped) {
+ s->inflight -= reaped;
+ atomic_store_release(ring->head, head);
+ }
+ return reaped;
+}
+
+/*
+ * Drain the completion ring for NVMe uring passthrough commands.
+ *
+ * Walks the CQ ring from the cached head until it catches up with the
+ * acquire-loaded tail, accounting each completion against its owning
+ * file and, when "stats" is set, batching per-clock-index counts into
+ * add_stat().  Returns the number of completions reaped, or -1 on the
+ * first CQE that completes with a non-zero result.
+ */
+static int reap_events_uring_pt(struct submitter *s)
+{
+ struct io_cq_ring *ring = &s->cq_ring;
+ struct io_uring_cqe *cqe;
+ unsigned head, reaped = 0;
+ int last_idx = -1, stat_nr = 0;
+ unsigned index;
+ int fileno;
+
+ head = *ring->head;
+ do {
+ struct file *f;
+
+ /* acquire pairs with the kernel's release store of the CQ tail */
+ if (head == atomic_load_acquire(ring->tail))
+ break;
+ index = head & cq_ring_mask;
+ /* CQE32 ring: each completion occupies two io_uring_cqe slots */
+ cqe = &ring->cqes[index << 1];
+ /* low 32 bits of user_data encode the file index */
+ fileno = cqe->user_data & 0xffffffff;
+ f = &s->files[fileno];
+ f->pending_ios--;
+
+ /* passthrough commands are expected to complete with res == 0 */
+ if (cqe->res != 0) {
+ printf("io: unexpected ret=%d\n", cqe->res);
+ if (polled && cqe->res == -EINVAL)
+ printf("passthrough doesn't support polled IO\n");
+ return -1;
+ }
+ if (stats) {
+ /* high 32 bits of user_data carry the stat clock bucket */
+ int clock_index = cqe->user_data >> 32;
+
+ /* flush the batched count whenever the bucket changes */
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
+ }
+ stat_nr++;
+ }
+ reaped++;
+ head++;
+ } while (1);
+
+ /* flush the final batch of stats, if any */
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ if (reaped) {
+ s->inflight -= reaped;
+ /* publish the new head so the kernel may reuse the entries */
+ atomic_store_release(ring->head, head);
+ }
+ return reaped;
+}
+
+/*
+ * Bind this submitter to its configured NUMA node: prefer memory
+ * allocations from that node and pin the thread to the node's CPUs.
+ * No-op when no node was configured (numa_node == -1) or when built
+ * without libnuma support.
+ *
+ * Fix: the allocated cpumask was previously leaked; the kernel copies
+ * the mask during numa_sched_setaffinity(), so it is safe to free it
+ * immediately afterwards.
+ */
+static void set_affinity(struct submitter *s)
+{
+#ifdef CONFIG_LIBNUMA
+	struct bitmask *mask;
+
+	if (s->numa_node == -1)
+		return;
+
+	numa_set_preferred(s->numa_node);
+
+	mask = numa_allocate_cpumask();
+	numa_node_to_cpus(s->numa_node, mask);
+	numa_sched_setaffinity(s->tid, mask);
+	/* mask was copied by the kernel; release it to avoid a leak */
+	numa_free_cpumask(mask);
+#endif
+}
+
+/*
+ * Look up the NUMA node of the device backing "name" via sysfs and
+ * store it in s->numa_node.  Returns 0 on success (including non-NUMA
+ * builds, which record -1), or -1 if the sysfs attribute cannot be
+ * read.
+ *
+ * Fixes: the sysfs read was fed to atoi() without NUL-termination,
+ * reading past the valid bytes when the file lacked a terminator or
+ * filled the buffer; a zero-length read was also treated as success.
+ * The path is now built with snprintf() to bound the write.
+ */
+static int detect_node(struct submitter *s, const char *name)
+{
+#ifdef CONFIG_LIBNUMA
+	/* NOTE(review): assumes GNU basename(3) (const-safe, does not
+	 * modify its argument) — confirm <string.h> is included before
+	 * <libgen.h> in this file. */
+	const char *base = basename(name);
+	char str[128];
+	int ret, fd, node;
+
+	if (pt)
+		snprintf(str, sizeof(str), "/sys/class/nvme-generic/%s/device/numa_node", base);
+	else
+		snprintf(str, sizeof(str), "/sys/block/%s/device/numa_node", base);
+	fd = open(str, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	/* leave room for the terminator; reject errors and empty reads */
+	ret = read(fd, str, sizeof(str) - 1);
+	if (ret <= 0) {
+		close(fd);
+		return -1;
+	}
+	str[ret] = '\0';
+	node = atoi(str);
+	s->numa_node = node;
+	close(fd);
+#else
+	s->numa_node = -1;
+#endif
+	return 0;
+}
+
+/*
+ * Initialize a legacy libaio context for this submitter.  Options that
+ * aio cannot honor (polled IO, SQPOLL, nop commands, registered files
+ * or buffers) are reported on stderr and turned off rather than
+ * failing the run.  Returns the io_queue_init() result on libaio
+ * builds; otherwise fails with errno = EINVAL.
+ */
+static int setup_aio(struct submitter *s)
+{
+#ifdef CONFIG_LIBAIO
+	if (polled) {
+		fprintf(stderr, "aio does not support polled IO\n");
+		polled = 0;
+	}
+	if (sq_thread_poll) {
+		fprintf(stderr, "aio does not support SQPOLL IO\n");
+		sq_thread_poll = 0;
+	}
+	if (do_nop) {
+		/* fix: message previously said "polled IO" (copy-paste) */
+		fprintf(stderr, "aio does not support nop IO\n");
+		do_nop = 0;
+	}
+	if (fixedbufs || register_files) {
+		fprintf(stderr, "aio does not support registered files or buffers\n");
+		fixedbufs = register_files = 0;
+	}
+
+	return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
+#else
+	fprintf(stderr, "Legacy AIO not available on this system/build\n");
+	errno = EINVAL;
+	return -1;
+#endif
+}
+
+/*
+ * Create and map an io_uring instance for this submitter: set up the
+ * ring with the requested flags (IOPOLL, SQPOLL, and SQE128/CQE32 for
+ * NVMe passthrough), optionally register buffers/files, then mmap the
+ * SQ ring, the SQE array, and the CQ ring, caching the ring masks.
+ * Returns 0 on success, 1 on any failure.
+ *
+ * Fix: all three mmap() calls were unchecked; MAP_FAILED would have
+ * been used as a base pointer and dereferenced.
+ */
+static int setup_ring(struct submitter *s)
+{
+	struct io_sq_ring *sring = &s->sq_ring;
+	struct io_cq_ring *cring = &s->cq_ring;
+	struct io_uring_params p;
+	int ret, fd, i;
+	void *ptr;
+	size_t len;
+
+	memset(&p, 0, sizeof(p));
+
+	if (polled && !do_nop)
+		p.flags |= IORING_SETUP_IOPOLL;
+	if (sq_thread_poll) {
+		p.flags |= IORING_SETUP_SQPOLL;
+		if (sq_thread_cpu != -1) {
+			p.flags |= IORING_SETUP_SQ_AFF;
+			p.sq_thread_cpu = sq_thread_cpu;
+		}
+	}
+	if (pt) {
+		/* passthrough needs big SQEs/CQEs for the NVMe command payload */
+		p.flags |= IORING_SETUP_SQE128;
+		p.flags |= IORING_SETUP_CQE32;
+	}
+
+	fd = io_uring_setup(depth, &p);
+	if (fd < 0) {
+		perror("io_uring_setup");
+		return 1;
+	}
+	s->ring_fd = s->enter_ring_fd = fd;
+
+	io_uring_probe(fd);
+
+	if (fixedbufs) {
+		struct rlimit rlim;
+
+		rlim.rlim_cur = RLIM_INFINITY;
+		rlim.rlim_max = RLIM_INFINITY;
+		/* ignore potential error, not needed on newer kernels */
+		setrlimit(RLIMIT_MEMLOCK, &rlim);
+
+		ret = io_uring_register_buffers(s);
+		if (ret < 0) {
+			perror("io_uring_register_buffers");
+			return 1;
+		}
+
+		if (dma_map) {
+			ret = io_uring_map_buffers(s);
+			if (ret < 0) {
+				perror("io_uring_map_buffers");
+				return 1;
+			}
+		}
+	}
+
+	if (register_files) {
+		ret = io_uring_register_files(s);
+		if (ret < 0) {
+			perror("io_uring_register_files");
+			return 1;
+		}
+	}
+
+	ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+			IORING_OFF_SQ_RING);
+	if (ptr == MAP_FAILED) {
+		perror("mmap sq ring");
+		return 1;
+	}
+	sring->head = ptr + p.sq_off.head;
+	sring->tail = ptr + p.sq_off.tail;
+	sring->ring_mask = ptr + p.sq_off.ring_mask;
+	sring->ring_entries = ptr + p.sq_off.ring_entries;
+	sring->flags = ptr + p.sq_off.flags;
+	sring->array = ptr + p.sq_off.array;
+	sq_ring_mask = *sring->ring_mask;
+
+	/* SQE128 doubles the size of each submission entry */
+	if (p.flags & IORING_SETUP_SQE128)
+		len = 2 * p.sq_entries * sizeof(struct io_uring_sqe);
+	else
+		len = p.sq_entries * sizeof(struct io_uring_sqe);
+	s->sqes = mmap(0, len,
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+			IORING_OFF_SQES);
+	if (s->sqes == MAP_FAILED) {
+		perror("mmap sqes");
+		return 1;
+	}
+
+	/* CQE32 doubles the size of each completion entry */
+	if (p.flags & IORING_SETUP_CQE32) {
+		len = p.cq_off.cqes +
+			2 * p.cq_entries * sizeof(struct io_uring_cqe);
+	} else {
+		len = p.cq_off.cqes +
+			p.cq_entries * sizeof(struct io_uring_cqe);
+	}
+	ptr = mmap(0, len,
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+			IORING_OFF_CQ_RING);
+	if (ptr == MAP_FAILED) {
+		perror("mmap cq ring");
+		return 1;
+	}
+	cring->head = ptr + p.cq_off.head;
+	cring->tail = ptr + p.cq_off.tail;
+	cring->ring_mask = ptr + p.cq_off.ring_mask;
+	cring->ring_entries = ptr + p.cq_off.ring_entries;
+	cring->cqes = ptr + p.cq_off.cqes;
+	cq_ring_mask = *cring->ring_mask;
+
+	/* identity-map the SQ index array: slot i always points at SQE i */
+	for (i = 0; i < p.sq_entries; i++)
+		sring->array[i] = i;
+
+	return 0;
+}