X-Git-Url: https://git.kernel.dk/?a=blobdiff_plain;f=t%2Fio_uring.c;h=1ea0a9da29b6444ab66d9794940677c5e87f2475;hb=7d7a704638a1e957c845c04eeac82bdeda0c674c;hp=3130e4695179f25dfd037cb229858e1ca57d70e9;hpb=4f2152278e0b3c35ded02fb3e6fb550eab7bedcd;p=fio.git diff --git a/t/io_uring.c b/t/io_uring.c index 3130e469..1ea0a9da 100644 --- a/t/io_uring.c +++ b/t/io_uring.c @@ -5,6 +5,15 @@ #include #include #include +#include + +#ifdef CONFIG_LIBAIO +#include +#endif + +#ifdef CONFIG_LIBNUMA +#include +#endif #include #include @@ -21,10 +30,13 @@ #include #include "../arch/arch.h" +#include "../os/os.h" #include "../lib/types.h" +#include "../lib/roundup.h" +#include "../lib/rand.h" +#include "../minmax.h" #include "../os/linux/io_uring.h" - -#define min(a, b) ((a < b) ? (a) : (b)) +#include "../engines/nvme.h" struct io_sq_ring { unsigned *head; @@ -54,19 +66,31 @@ static unsigned sq_ring_mask, cq_ring_mask; struct file { unsigned long max_blocks; + unsigned long max_size; + unsigned long cur_off; unsigned pending_ios; + unsigned int nsid; /* nsid field required for nvme-passthrough */ + unsigned int lba_shift; /* lba_shift field required for nvme-passthrough */ int real_fd; int fixed_fd; + int fileno; }; +#define PLAT_BITS 6 +#define PLAT_VAL (1 << PLAT_BITS) +#define PLAT_GROUP_NR 29 +#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL) + struct submitter { pthread_t thread; int ring_fd; + int enter_ring_fd; int index; struct io_sq_ring sq_ring; struct io_uring_sqe *sqes; struct io_cq_ring cq_ring; int inflight; + int tid; unsigned long reaps; unsigned long done; unsigned long calls; @@ -74,6 +98,19 @@ struct submitter { __s32 *fds; + struct taus258_state rand_state; + + unsigned long *clock_batch; + int clock_index; + unsigned long *plat; + +#ifdef CONFIG_LIBAIO + io_context_t aio_ctx; +#endif + + int numa_node; + const char *filename; + struct file files[MAX_FDS]; unsigned nr_files; unsigned cur_file; @@ -82,6 +119,9 @@ struct submitter { static struct submitter *submitter; static volatile int finish; +static int stats_running; +static unsigned long max_iops; +static long t_io_uring_page_size; static int depth = DEPTH; static int batch_submit = BATCH_SUBMIT; @@ -89,22 +129,305 @@ static int batch_complete = BATCH_COMPLETE; static int bs = BS; static int polled = 1; /* use IO polling */ static int fixedbufs = 1; /* use fixed user buffers */ +static int dma_map; /* pre-map DMA buffers */ static int register_files = 1; /* use fixed files */ static int buffered = 0; /* use buffered IO, not O_DIRECT */ static int sq_thread_poll = 0; /* use kernel submission/poller thread */ static int sq_thread_cpu = -1; /* pin above thread to this CPU */ static int do_nop = 0; /* no-op SQ ring commands */ static int nthreads = 1; +static int stats = 0; /* generate IO stats */ +static int aio = 0; /* use libaio */ +static int runtime = 0; /* runtime */ +static int random_io = 1; /* random or sequential IO */ +static int register_ring = 1; /* register ring */ +static int use_sync = 0; /* use preadv2 */ +static int numa_placement = 0; /* set to node of device */ +static int pt = 0; /* passthrough I/O or not */ + +static unsigned long tsc_rate; + +#define TSC_RATE_FILE "tsc-rate" static int vectored = 1; +static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, + 80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 }; +static int plist_len = 17; + +#ifndef IORING_REGISTER_MAP_BUFFERS +#define IORING_REGISTER_MAP_BUFFERS 26 +struct io_uring_map_buffers { + __s32 fd; + __u32 buf_start; + __u32 buf_end; + __u32 
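+	/* fallback definition for headers that lack the (out-of-tree) map-buffers
+	 * registration; consumed by io_uring_map_buffers() when -D/dma_map is set */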
flags; + __u64 rsvd[2]; +}; +#endif + +static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns, + enum nvme_csi csi, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_admin_identify, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = NVME_IDENTIFY_DATA_SIZE, + .cdw10 = cns, + .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); +} + +static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba) +{ + struct nvme_id_ns ns; + int namespace_id; + int err; + + namespace_id = ioctl(fd, NVME_IOCTL_ID); + if (namespace_id < 0) { + fprintf(stderr, "error failed to fetch namespace-id\n"); + close(fd); + return -errno; + } + + /* + * Identify namespace to get namespace-id, namespace size in LBA's + * and LBA data size. + */ + err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS, + NVME_CSI_NVM, &ns); + if (err) { + fprintf(stderr, "error failed to fetch identify namespace\n"); + close(fd); + return err; + } + + *nsid = namespace_id; + *lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds; + *nlba = ns.nsze; + + return 0; +} + +static unsigned long cycles_to_nsec(unsigned long cycles) +{ + uint64_t val; + + if (!tsc_rate) + return cycles; + + val = cycles * 1000000000ULL; + return val / tsc_rate; +} + +static unsigned long plat_idx_to_val(unsigned int idx) +{ + unsigned int error_bits; + unsigned long k, base; + + assert(idx < PLAT_NR); + + /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index */ + if (idx < (PLAT_VAL << 1)) + return cycles_to_nsec(idx); + + /* Find the group and compute the minimum value of that group */ + error_bits = (idx >> PLAT_BITS) - 1; + base = ((unsigned long) 1) << (error_bits + PLAT_BITS); + + /* Find its bucket number of the group */ + k = idx % PLAT_VAL; + + /* Return the mean of the range of the bucket */ + return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits))); +} + +unsigned int calculate_clat_percentiles(unsigned long *io_u_plat, + unsigned long nr, unsigned long **output, + unsigned long *maxv, unsigned long *minv) +{ + unsigned long sum = 0; + unsigned int len = plist_len, i, j = 0; + unsigned long *ovals = NULL; + bool is_last; + + *minv = -1UL; + *maxv = 0; + + ovals = malloc(len * sizeof(*ovals)); + if (!ovals) + return 0; + + /* + * Calculate bucket values, note down max and min values + */ + is_last = false; + for (i = 0; i < PLAT_NR && !is_last; i++) { + sum += io_u_plat[i]; + while (sum >= ((long double) plist[j] / 100.0 * nr)) { + assert(plist[j] <= 100.0); + + ovals[j] = plat_idx_to_val(i); + if (ovals[j] < *minv) + *minv = ovals[j]; + if (ovals[j] > *maxv) + *maxv = ovals[j]; + + is_last = (j == len - 1) != 0; + if (is_last) + break; + + j++; + } + } + + if (!is_last) + fprintf(stderr, "error calculating latency percentiles\n"); + + *output = ovals; + return len; +} + +static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr, + unsigned int precision) +{ + unsigned int divisor, len, i, j = 0; + unsigned long minv, maxv; + unsigned long *ovals; + int per_line, scale_down, time_width; + bool is_last; + char fmt[32]; + + len = calculate_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv); + if (!len || !ovals) + goto out; + + if (!tsc_rate) { + scale_down = 0; + divisor = 1; + printf(" percentiles (tsc ticks):\n |"); + } else if (minv > 2000 && maxv > 99999) { + scale_down = 1; + divisor = 1000; + printf(" percentiles (usec):\n |"); + } else { + scale_down = 
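+		/* values are small enough to print in nsec without dividing */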
0; + divisor = 1; + printf(" percentiles (nsec):\n |"); + } + + time_width = max(5, (int) (log10(maxv / divisor) + 1)); + snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3, + precision, time_width); + /* fmt will be something like " %5.2fth=[%4llu]%c" */ + per_line = (80 - 7) / (precision + 10 + time_width); + + for (j = 0; j < len; j++) { + /* for formatting */ + if (j != 0 && (j % per_line) == 0) + printf(" |"); + + /* end of the list */ + is_last = (j == len - 1) != 0; + + for (i = 0; i < scale_down; i++) + ovals[j] = (ovals[j] + 999) / 1000; + + printf(fmt, plist[j], ovals[j], is_last ? '\n' : ','); + + if (is_last) + break; + + if ((j % per_line) == per_line - 1) /* for formatting */ + printf("\n"); + } + +out: + free(ovals); +} + +#ifdef ARCH_HAVE_CPU_CLOCK +static unsigned int plat_val_to_idx(unsigned long val) +{ + unsigned int msb, error_bits, base, offset, idx; + + /* Find MSB starting from bit 0 */ + if (val == 0) + msb = 0; + else + msb = (sizeof(val)*8) - __builtin_clzll(val) - 1; + + /* + * MSB <= (PLAT_BITS-1), cannot be rounded off. Use + * all bits of the sample as index + */ + if (msb <= PLAT_BITS) + return val; + + /* Compute the number of error bits to discard*/ + error_bits = msb - PLAT_BITS; + + /* Compute the number of buckets before the group */ + base = (error_bits + 1) << PLAT_BITS; + + /* + * Discard the error bits and apply the mask to find the + * index for the buckets in the group + */ + offset = (PLAT_VAL - 1) & (val >> error_bits); + + /* Make sure the index does not exceed (array size - 1) */ + idx = (base + offset) < (PLAT_NR - 1) ? + (base + offset) : (PLAT_NR - 1); + + return idx; +} +#endif + +static void add_stat(struct submitter *s, int clock_index, int nr) +{ +#ifdef ARCH_HAVE_CPU_CLOCK + unsigned long cycles; + unsigned int pidx; + + if (!s->finish && clock_index) { + cycles = get_cpu_clock(); + cycles -= s->clock_batch[clock_index]; + pidx = plat_val_to_idx(cycles); + s->plat[pidx] += nr; + } +#endif +} + +static int io_uring_map_buffers(struct submitter *s) +{ + struct io_uring_map_buffers map = { + .fd = s->files[0].real_fd, + .buf_end = depth, + }; + + if (do_nop) + return 0; + if (s->nr_files > 1) + fprintf(stdout, "Mapping buffers may not work with multiple files\n"); + + return syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_MAP_BUFFERS, &map, 1); +} + static int io_uring_register_buffers(struct submitter *s) { if (do_nop) return 0; return syscall(__NR_io_uring_register, s->ring_fd, - IORING_REGISTER_BUFFERS, s->iovecs, depth); + IORING_REGISTER_BUFFERS, s->iovecs, roundup_pow2(depth)); } static int io_uring_register_files(struct submitter *s) @@ -126,7 +449,37 @@ static int io_uring_register_files(struct submitter *s) static int io_uring_setup(unsigned entries, struct io_uring_params *p) { - return syscall(__NR_io_uring_setup, entries, p); + int ret; + + /* + * Clamp CQ ring size at our SQ ring size, we don't need more entries + * than that. 
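+	 * Also opt in to newer setup flags below; io_uring_setup() is retried
+	 * with each flag cleared if the running kernel reports EINVAL for it.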
+ */ + p->flags |= IORING_SETUP_CQSIZE; + p->cq_entries = entries; + + p->flags |= IORING_SETUP_COOP_TASKRUN; + p->flags |= IORING_SETUP_SINGLE_ISSUER; + p->flags |= IORING_SETUP_DEFER_TASKRUN; +retry: + ret = syscall(__NR_io_uring_setup, entries, p); + if (!ret) + return 0; + + if (errno == EINVAL && p->flags & IORING_SETUP_COOP_TASKRUN) { + p->flags &= ~IORING_SETUP_COOP_TASKRUN; + goto retry; + } + if (errno == EINVAL && p->flags & IORING_SETUP_SINGLE_ISSUER) { + p->flags &= ~IORING_SETUP_SINGLE_ISSUER; + goto retry; + } + if (errno == EINVAL && p->flags & IORING_SETUP_DEFER_TASKRUN) { + p->flags &= ~IORING_SETUP_DEFER_TASKRUN; + goto retry; + } + + return ret; } static void io_uring_probe(int fd) @@ -155,28 +508,44 @@ out: static int io_uring_enter(struct submitter *s, unsigned int to_submit, unsigned int min_complete, unsigned int flags) { - return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete, - flags, NULL, 0); + if (register_ring) + flags |= IORING_ENTER_REGISTERED_RING; +#ifdef FIO_ARCH_HAS_SYSCALL + return __do_syscall6(__NR_io_uring_enter, s->enter_ring_fd, to_submit, + min_complete, flags, NULL, 0); +#else + return syscall(__NR_io_uring_enter, s->enter_ring_fd, to_submit, + min_complete, flags, NULL, 0); +#endif } -#ifndef CONFIG_HAVE_GETTID -static int gettid(void) +static unsigned file_depth(struct submitter *s) { - return syscall(__NR_gettid); + return (depth + s->nr_files - 1) / s->nr_files; } -#endif -static unsigned file_depth(struct submitter *s) +static unsigned long long get_offset(struct submitter *s, struct file *f) { - return (depth + s->nr_files - 1) / s->nr_files; + unsigned long long offset; + long r; + + if (random_io) { + r = __rand64(&s->rand_state); + offset = (r % (f->max_blocks - 1)) * bs; + } else { + offset = f->cur_off; + f->cur_off += bs; + if (f->cur_off + bs > f->max_size) + f->cur_off = 0; + } + + return offset; } static void init_io(struct submitter *s, unsigned index) { struct io_uring_sqe *sqe = &s->sqes[index]; - unsigned long offset; struct file *f; - long r; if (do_nop) { sqe->opcode = IORING_OP_NOP; @@ -196,9 +565,6 @@ static void init_io(struct submitter *s, unsigned index) } f->pending_ios++; - r = lrand48(); - offset = (r % (f->max_blocks - 1)) * bs; - if (register_files) { sqe->flags = IOSQE_FIXED_FILE; sqe->fd = f->fixed_fd; @@ -223,24 +589,96 @@ static void init_io(struct submitter *s, unsigned index) sqe->buf_index = 0; } sqe->ioprio = 0; - sqe->off = offset; - sqe->user_data = (unsigned long) f; + sqe->off = get_offset(s, f); + sqe->user_data = (unsigned long) f->fileno; + if (stats && stats_running) + sqe->user_data |= ((uint64_t)s->clock_index << 32); +} + +static void init_io_pt(struct submitter *s, unsigned index) +{ + struct io_uring_sqe *sqe = &s->sqes[index << 1]; + unsigned long offset; + struct file *f; + struct nvme_uring_cmd *cmd; + unsigned long long slba; + unsigned long long nlb; + long r; + + if (s->nr_files == 1) { + f = &s->files[0]; + } else { + f = &s->files[s->cur_file]; + if (f->pending_ios >= file_depth(s)) { + s->cur_file++; + if (s->cur_file == s->nr_files) + s->cur_file = 0; + f = &s->files[s->cur_file]; + } + } + f->pending_ios++; + + if (random_io) { + r = __rand64(&s->rand_state); + offset = (r % (f->max_blocks - 1)) * bs; + } else { + offset = f->cur_off; + f->cur_off += bs; + if (f->cur_off + bs > f->max_size) + f->cur_off = 0; + } + + if (register_files) { + sqe->fd = f->fixed_fd; + sqe->flags = IOSQE_FIXED_FILE; + } else { + sqe->fd = f->real_fd; + sqe->flags = 0; + } + sqe->opcode = 
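+	/* passthrough uses big (128B) SQEs (IORING_SETUP_SQE128): the
+	 * nvme_uring_cmd lives in sqe->cmd, hence the (index << 1) slot math */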
IORING_OP_URING_CMD; + sqe->user_data = (unsigned long) f->fileno; + if (stats) + sqe->user_data |= ((__u64) s->clock_index << 32ULL); + sqe->cmd_op = NVME_URING_CMD_IO; + slba = offset >> f->lba_shift; + nlb = (bs >> f->lba_shift) - 1; + cmd = (struct nvme_uring_cmd *)&sqe->cmd; + /* cdw10 and cdw11 represent starting slba*/ + cmd->cdw10 = slba & 0xffffffff; + cmd->cdw11 = slba >> 32; + /* cdw12 represent number of lba to be read*/ + cmd->cdw12 = nlb; + cmd->addr = (unsigned long) s->iovecs[index].iov_base; + cmd->data_len = bs; + if (fixedbufs) { + sqe->uring_cmd_flags = IORING_URING_CMD_FIXED; + sqe->buf_index = index; + } + cmd->nsid = f->nsid; + cmd->opcode = 2; } -static int prep_more_ios(struct submitter *s, int max_ios) +static int prep_more_ios_uring(struct submitter *s, int max_ios) { struct io_sq_ring *ring = &s->sq_ring; - unsigned index, tail, next_tail, prepped = 0; + unsigned head, index, tail, next_tail, prepped = 0; + + if (sq_thread_poll) + head = atomic_load_acquire(ring->head); + else + head = *ring->head; next_tail = tail = *ring->tail; do { next_tail++; - if (next_tail == atomic_load_acquire(ring->head)) + if (next_tail == head) break; index = tail & sq_ring_mask; - init_io(s, index); - ring->array[index] = index; + if (pt) + init_io_pt(s, index); + else + init_io(s, index); prepped++; tail = next_tail; } while (prepped < max_ios); @@ -256,38 +694,64 @@ static int get_file_size(struct file *f) if (fstat(f->real_fd, &st) < 0) return -1; - if (S_ISBLK(st.st_mode)) { + if (pt) { + __u64 nlba; + __u32 lbs; + int ret; + + if (!S_ISCHR(st.st_mode)) { + fprintf(stderr, "passthrough works with only nvme-ns " + "generic devices (/dev/ngXnY)\n"); + return -1; + } + ret = nvme_get_info(f->real_fd, &f->nsid, &lbs, &nlba); + if (ret) + return -1; + if ((bs % lbs) != 0) { + printf("error: bs:%d should be a multiple logical_block_size:%d\n", + bs, lbs); + return -1; + } + f->max_blocks = nlba / bs; + f->max_size = nlba; + f->lba_shift = ilog2(lbs); + return 0; + } else if (S_ISBLK(st.st_mode)) { unsigned long long bytes; if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) return -1; f->max_blocks = bytes / bs; + f->max_size = bytes; return 0; } else if (S_ISREG(st.st_mode)) { f->max_blocks = st.st_size / bs; + f->max_size = st.st_size; return 0; } return -1; } -static int reap_events(struct submitter *s) +static int reap_events_uring(struct submitter *s) { struct io_cq_ring *ring = &s->cq_ring; struct io_uring_cqe *cqe; unsigned head, reaped = 0; + int last_idx = -1, stat_nr = 0; head = *ring->head; do { struct file *f; - read_barrier(); if (head == atomic_load_acquire(ring->tail)) break; cqe = &ring->cqes[head & cq_ring_mask]; if (!do_nop) { - f = (struct file *) (uintptr_t) cqe->user_data; + int fileno = cqe->user_data & 0xffffffff; + + f = &s->files[fileno]; f->pending_ios--; if (cqe->res != bs) { printf("io: unexpected ret=%d\n", cqe->res); @@ -296,10 +760,25 @@ static int reap_events(struct submitter *s) return -1; } } + if (stats) { + int clock_index = cqe->user_data >> 32; + + if (last_idx != clock_index) { + if (last_idx != -1) { + add_stat(s, last_idx, stat_nr); + stat_nr = 0; + } + last_idx = clock_index; + } + stat_nr++; + } reaped++; head++; } while (1); + if (stat_nr) + add_stat(s, last_idx, stat_nr); + if (reaped) { s->inflight -= reaped; atomic_store_release(ring->head, head); @@ -307,135 +786,130 @@ static int reap_events(struct submitter *s) return reaped; } -static void *submitter_fn(void *data) +static int reap_events_uring_pt(struct submitter *s) { - struct 
submitter *s = data; - struct io_sq_ring *ring = &s->sq_ring; - int ret, prepped; - - printf("submitter=%d\n", gettid()); - - srand48(pthread_self()); + struct io_cq_ring *ring = &s->cq_ring; + struct io_uring_cqe *cqe; + unsigned head, reaped = 0; + int last_idx = -1, stat_nr = 0; + unsigned index; + int fileno; - prepped = 0; + head = *ring->head; do { - int to_wait, to_submit, this_reap, to_prep; - unsigned ring_flags = 0; + struct file *f; - if (!prepped && s->inflight < depth) { - to_prep = min(depth - s->inflight, batch_submit); - prepped = prep_more_ios(s, to_prep); + if (head == atomic_load_acquire(ring->tail)) + break; + index = head & cq_ring_mask; + cqe = &ring->cqes[index << 1]; + fileno = cqe->user_data & 0xffffffff; + f = &s->files[fileno]; + f->pending_ios--; + + if (cqe->res != 0) { + printf("io: unexpected ret=%d\n", cqe->res); + if (polled && cqe->res == -EINVAL) + printf("passthrough doesn't support polled IO\n"); + return -1; } - s->inflight += prepped; -submit_more: - to_submit = prepped; -submit: - if (to_submit && (s->inflight + to_submit <= depth)) - to_wait = 0; - else - to_wait = min(s->inflight + to_submit, batch_complete); - - /* - * Only need to call io_uring_enter if we're not using SQ thread - * poll, or if IORING_SQ_NEED_WAKEUP is set. - */ - if (sq_thread_poll) - ring_flags = atomic_load_acquire(ring->flags); - if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) { - unsigned flags = 0; - - if (to_wait) - flags = IORING_ENTER_GETEVENTS; - if (ring_flags & IORING_SQ_NEED_WAKEUP) - flags |= IORING_ENTER_SQ_WAKEUP; - ret = io_uring_enter(s, to_submit, to_wait, flags); - s->calls++; - } else { - /* for SQPOLL, we submitted it all effectively */ - ret = to_submit; + if (stats) { + int clock_index = cqe->user_data >> 32; + + if (last_idx != clock_index) { + if (last_idx != -1) { + add_stat(s, last_idx, stat_nr); + stat_nr = 0; + } + last_idx = clock_index; + } + stat_nr++; } + reaped++; + head++; + } while (1); - /* - * For non SQ thread poll, we already got the events we needed - * through the io_uring_enter() above. For SQ thread poll, we - * need to loop here until we find enough events. 
- */ - this_reap = 0; - do { - int r; - r = reap_events(s); - if (r == -1) { - s->finish = 1; - break; - } else if (r > 0) - this_reap += r; - } while (sq_thread_poll && this_reap < to_wait); - s->reaps += this_reap; + if (stat_nr) + add_stat(s, last_idx, stat_nr); - if (ret >= 0) { - if (!ret) { - to_submit = 0; - if (s->inflight) - goto submit; - continue; - } else if (ret < to_submit) { - int diff = to_submit - ret; + if (reaped) { + s->inflight -= reaped; + atomic_store_release(ring->head, head); + } + return reaped; +} - s->done += ret; - prepped -= diff; - goto submit_more; - } - s->done += ret; - prepped = 0; - continue; - } else if (ret < 0) { - if (errno == EAGAIN) { - if (s->finish) - break; - if (this_reap) - goto submit; - to_submit = 0; - goto submit; - } - printf("io_submit: %s\n", strerror(errno)); - break; - } - } while (!s->finish); +static void set_affinity(struct submitter *s) +{ +#ifdef CONFIG_LIBNUMA + struct bitmask *mask; - finish = 1; - return NULL; -} + if (s->numa_node == -1) + return; -static struct submitter *get_submitter(int offset) -{ - void *ret; + numa_set_preferred(s->numa_node); - ret = submitter; - if (offset) - ret += offset * (sizeof(*submitter) + depth * sizeof(struct iovec)); - return ret; + mask = numa_allocate_cpumask(); + numa_node_to_cpus(s->numa_node, mask); + numa_sched_setaffinity(s->tid, mask); +#endif } -static void sig_int(int sig) +static int detect_node(struct submitter *s, const char *name) { - int j; +#ifdef CONFIG_LIBNUMA + const char *base = basename(name); + char str[128]; + int ret, fd, node; + + if (pt) + sprintf(str, "/sys/class/nvme-generic/%s/device/numa_node", base); + else + sprintf(str, "/sys/block/%s/device/numa_node", base); + fd = open(str, O_RDONLY); + if (fd < 0) + return -1; - printf("Exiting on signal %d\n", sig); - for (j = 0; j < nthreads; j++) { - struct submitter *s = get_submitter(j); - s->finish = 1; + ret = read(fd, str, sizeof(str)); + if (ret < 0) { + close(fd); + return -1; } - finish = 1; + node = atoi(str); + s->numa_node = node; + close(fd); +#else + s->numa_node = -1; +#endif + return 0; } -static void arm_sig_int(void) +static int setup_aio(struct submitter *s) { - struct sigaction act; +#ifdef CONFIG_LIBAIO + if (polled) { + fprintf(stderr, "aio does not support polled IO\n"); + polled = 0; + } + if (sq_thread_poll) { + fprintf(stderr, "aio does not support SQPOLL IO\n"); + sq_thread_poll = 0; + } + if (do_nop) { + fprintf(stderr, "aio does not support polled IO\n"); + do_nop = 0; + } + if (fixedbufs || register_files) { + fprintf(stderr, "aio does not support registered files or buffers\n"); + fixedbufs = register_files = 0; + } - memset(&act, 0, sizeof(act)); - act.sa_handler = sig_int; - act.sa_flags = SA_RESTART; - sigaction(SIGINT, &act, NULL); + return io_queue_init(roundup_pow2(depth), &s->aio_ctx); +#else + fprintf(stderr, "Legacy AIO not available on this system/build\n"); + errno = EINVAL; + return -1; +#endif } static int setup_ring(struct submitter *s) @@ -443,8 +917,9 @@ static int setup_ring(struct submitter *s) struct io_sq_ring *sring = &s->sq_ring; struct io_cq_ring *cring = &s->cq_ring; struct io_uring_params p; - int ret, fd; + int ret, fd, i; void *ptr; + size_t len; memset(&p, 0, sizeof(p)); @@ -457,22 +932,41 @@ static int setup_ring(struct submitter *s) p.sq_thread_cpu = sq_thread_cpu; } } + if (pt) { + p.flags |= IORING_SETUP_SQE128; + p.flags |= IORING_SETUP_CQE32; + } fd = io_uring_setup(depth, &p); if (fd < 0) { perror("io_uring_setup"); return 1; } - s->ring_fd = fd; + 
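+	/* enter_ring_fd may later become a registered-ring index (see
+	 * io_uring_register_ring()); ring_fd always stays the real fd */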
s->ring_fd = s->enter_ring_fd = fd; io_uring_probe(fd); if (fixedbufs) { + struct rlimit rlim; + + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + /* ignore potential error, not needed on newer kernels */ + setrlimit(RLIMIT_MEMLOCK, &rlim); + ret = io_uring_register_buffers(s); if (ret < 0) { perror("io_uring_register_buffers"); return 1; } + + if (dma_map) { + ret = io_uring_map_buffers(s); + if (ret < 0) { + perror("io_uring_map_buffers"); + return 1; + } + } } if (register_files) { @@ -486,7 +980,6 @@ static int setup_ring(struct submitter *s) ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); - printf("sq_ring ptr = 0x%p\n", ptr); sring->head = ptr + p.sq_off.head; sring->tail = ptr + p.sq_off.tail; sring->ring_mask = ptr + p.sq_off.ring_mask; @@ -495,82 +988,644 @@ static int setup_ring(struct submitter *s) sring->array = ptr + p.sq_off.array; sq_ring_mask = *sring->ring_mask; - s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), + if (p.flags & IORING_SETUP_SQE128) + len = 2 * p.sq_entries * sizeof(struct io_uring_sqe); + else + len = p.sq_entries * sizeof(struct io_uring_sqe); + s->sqes = mmap(0, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); - printf("sqes ptr = 0x%p\n", s->sqes); - ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), + if (p.flags & IORING_SETUP_CQE32) { + len = p.cq_off.cqes + + 2 * p.cq_entries * sizeof(struct io_uring_cqe); + } else { + len = p.cq_off.cqes + + p.cq_entries * sizeof(struct io_uring_cqe); + } + ptr = mmap(0, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); - printf("cq_ring ptr = 0x%p\n", ptr); cring->head = ptr + p.cq_off.head; cring->tail = ptr + p.cq_off.tail; cring->ring_mask = ptr + p.cq_off.ring_mask; cring->ring_entries = ptr + p.cq_off.ring_entries; cring->cqes = ptr + p.cq_off.cqes; cq_ring_mask = *cring->ring_mask; + + for (i = 0; i < p.sq_entries; i++) + sring->array[i] = i; + return 0; } -static void file_depths(char *buf) +static void *allocate_mem(struct submitter *s, int size) { - bool prev = false; - char *p; - int i, j; + void *buf; - buf[0] = '\0'; - p = buf; - for (j = 0; j < nthreads; j++) { - struct submitter *s = get_submitter(j); +#ifdef CONFIG_LIBNUMA + if (s->numa_node != -1) + return numa_alloc_onnode(size, s->numa_node); +#endif + + if (posix_memalign(&buf, t_io_uring_page_size, bs)) { + printf("failed alloc\n"); + return NULL; + } + + return buf; +} + +static int submitter_init(struct submitter *s) +{ + int i, nr_batch, err; + static int init_printed; + char buf[80]; + s->tid = gettid(); + printf("submitter=%d, tid=%d, file=%s, node=%d\n", s->index, s->tid, + s->filename, s->numa_node); + + set_affinity(s); + + __init_rand64(&s->rand_state, s->tid); + srand48(s->tid); + + for (i = 0; i < MAX_FDS; i++) + s->files[i].fileno = i; + + for (i = 0; i < roundup_pow2(depth); i++) { + void *buf; + + buf = allocate_mem(s, bs); + if (!buf) + return 1; + s->iovecs[i].iov_base = buf; + s->iovecs[i].iov_len = bs; + } + + if (use_sync) { + sprintf(buf, "Engine=preadv2\n"); + err = 0; + } else if (!aio) { + err = setup_ring(s); + sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); + } else { + sprintf(buf, "Engine=aio\n"); + err = setup_aio(s); + } + if (err) { + printf("queue setup failed: %s, %d\n", strerror(errno), err); + return 1; + } + + if (!init_printed) { + 
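+		/* configuration banner is printed once, not per submitter thread */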
printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth); + printf("%s", buf); + init_printed = 1; + } + + if (stats) { + nr_batch = roundup_pow2(depth / batch_submit); + if (nr_batch < 2) + nr_batch = 2; + s->clock_batch = calloc(nr_batch, sizeof(unsigned long)); + s->clock_index = 1; + + s->plat = calloc(PLAT_NR, sizeof(unsigned long)); + } else { + s->clock_batch = NULL; + s->plat = NULL; + nr_batch = 0; + } + /* perform the expensive command initialization part for passthrough here + * rather than in the fast path + */ + if (pt) { + for (i = 0; i < roundup_pow2(depth); i++) { + struct io_uring_sqe *sqe = &s->sqes[i << 1]; + + memset(&sqe->cmd, 0, sizeof(struct nvme_uring_cmd)); + } + } + return nr_batch; +} + +#ifdef CONFIG_LIBAIO +static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs) +{ + uint64_t data; + struct file *f; + unsigned index; + + index = 0; + while (index < max_ios) { + struct iocb *iocb = &iocbs[index]; + + if (s->nr_files == 1) { + f = &s->files[0]; + } else { + f = &s->files[s->cur_file]; + if (f->pending_ios >= file_depth(s)) { + s->cur_file++; + if (s->cur_file == s->nr_files) + s->cur_file = 0; + f = &s->files[s->cur_file]; + } + } + f->pending_ios++; + + io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base, + s->iovecs[index].iov_len, get_offset(s, f)); + + data = f->fileno; + if (stats && stats_running) + data |= (((uint64_t) s->clock_index) << 32); + iocb->data = (void *) (uintptr_t) data; + index++; + } + return index; +} + +static int reap_events_aio(struct submitter *s, struct io_event *events, int evs) +{ + int last_idx = -1, stat_nr = 0; + int reaped = 0; + + while (evs) { + uint64_t data = (uintptr_t) events[reaped].data; + struct file *f = &s->files[data & 0xffffffff]; + + f->pending_ios--; + if (events[reaped].res != bs) { + printf("io: unexpected ret=%ld\n", events[reaped].res); + return -1; + } + if (stats) { + int clock_index = data >> 32; + + if (last_idx != clock_index) { + if (last_idx != -1) { + add_stat(s, last_idx, stat_nr); + stat_nr = 0; + } + last_idx = clock_index; + } + stat_nr++; + } + reaped++; + evs--; + } + + if (stat_nr) + add_stat(s, last_idx, stat_nr); - for (i = 0; i < s->nr_files; i++) { - struct file *f = &s->files[i]; + s->inflight -= reaped; + s->done += reaped; + return reaped; +} + +static void *submitter_aio_fn(void *data) +{ + struct submitter *s = data; + int i, ret, prepped; + struct iocb **iocbsptr; + struct iocb *iocbs; + struct io_event *events; +#ifdef ARCH_HAVE_CPU_CLOCK + int nr_batch = submitter_init(s); +#else + submitter_init(s); +#endif + + iocbsptr = calloc(depth, sizeof(struct iocb *)); + iocbs = calloc(depth, sizeof(struct iocb)); + events = calloc(depth, sizeof(struct io_event)); + + for (i = 0; i < depth; i++) + iocbsptr[i] = &iocbs[i]; + + prepped = 0; + do { + int to_wait, to_submit, to_prep; + + if (!prepped && s->inflight < depth) { + to_prep = min(depth - s->inflight, batch_submit); + prepped = prep_more_ios_aio(s, to_prep, iocbs); +#ifdef ARCH_HAVE_CPU_CLOCK + if (prepped && stats) { + s->clock_batch[s->clock_index] = get_cpu_clock(); + s->clock_index = (s->clock_index + 1) & (nr_batch - 1); + } +#endif + } + s->inflight += prepped; + to_submit = prepped; + + if (to_submit && (s->inflight + to_submit <= depth)) + to_wait = 0; + else + to_wait = min(s->inflight + to_submit, batch_complete); + + ret = io_submit(s->aio_ctx, to_submit, iocbsptr); + s->calls++; + if (ret < 0) { + 
perror("io_submit"); + break; + } else if (ret != to_submit) { + printf("submitted %d, wanted %d\n", ret, to_submit); + break; + } + prepped = 0; + + while (to_wait) { + int r; + + s->calls++; + r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL); + if (r < 0) { + perror("io_getevents"); + break; + } else if (r != to_wait) { + printf("r=%d, wait=%d\n", r, to_wait); + break; + } + r = reap_events_aio(s, events, r); + s->reaps += r; + to_wait -= r; + } + } while (!s->finish); + + free(iocbsptr); + free(iocbs); + free(events); + finish = 1; + return NULL; +} +#endif + +static void io_uring_unregister_ring(struct submitter *s) +{ + struct io_uring_rsrc_update up = { + .offset = s->enter_ring_fd, + }; + + syscall(__NR_io_uring_register, s->ring_fd, IORING_UNREGISTER_RING_FDS, + &up, 1); +} + +static int io_uring_register_ring(struct submitter *s) +{ + struct io_uring_rsrc_update up = { + .data = s->ring_fd, + .offset = -1U, + }; + int ret; + + ret = syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_RING_FDS, &up, 1); + if (ret == 1) { + s->enter_ring_fd = up.offset; + return 0; + } + register_ring = 0; + return -1; +} + +static void *submitter_uring_fn(void *data) +{ + struct submitter *s = data; + struct io_sq_ring *ring = &s->sq_ring; + int ret, prepped; +#ifdef ARCH_HAVE_CPU_CLOCK + int nr_batch = submitter_init(s); +#else + submitter_init(s); +#endif + + if (register_ring) + io_uring_register_ring(s); + + prepped = 0; + do { + int to_wait, to_submit, this_reap, to_prep; + unsigned ring_flags = 0; + + if (!prepped && s->inflight < depth) { + to_prep = min(depth - s->inflight, batch_submit); + prepped = prep_more_ios_uring(s, to_prep); +#ifdef ARCH_HAVE_CPU_CLOCK + if (prepped && stats) { + s->clock_batch[s->clock_index] = get_cpu_clock(); + s->clock_index = (s->clock_index + 1) & (nr_batch - 1); + } +#endif + } + s->inflight += prepped; +submit_more: + to_submit = prepped; +submit: + if (to_submit && (s->inflight + to_submit <= depth)) + to_wait = 0; + else + to_wait = min(s->inflight + to_submit, batch_complete); + + /* + * Only need to call io_uring_enter if we're not using SQ thread + * poll, or if IORING_SQ_NEED_WAKEUP is set. + */ + if (sq_thread_poll) + ring_flags = atomic_load_acquire(ring->flags); + if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) { + unsigned flags = 0; + + if (to_wait) + flags = IORING_ENTER_GETEVENTS; + if (ring_flags & IORING_SQ_NEED_WAKEUP) + flags |= IORING_ENTER_SQ_WAKEUP; + ret = io_uring_enter(s, to_submit, to_wait, flags); + s->calls++; + } else { + /* for SQPOLL, we submitted it all effectively */ + ret = to_submit; + } + + /* + * For non SQ thread poll, we already got the events we needed + * through the io_uring_enter() above. For SQ thread poll, we + * need to loop here until we find enough events. 
+ */ + this_reap = 0; + do { + int r; - if (prev) - p += sprintf(p, " %d", f->pending_ios); + if (pt) + r = reap_events_uring_pt(s); else - p += sprintf(p, "%d", f->pending_ios); - prev = true; + r = reap_events_uring(s); + if (r == -1) { + s->finish = 1; + break; + } else if (r > 0) + this_reap += r; + } while (sq_thread_poll && this_reap < to_wait); + s->reaps += this_reap; + + if (ret >= 0) { + if (!ret) { + to_submit = 0; + if (s->inflight) + goto submit; + continue; + } else if (ret < to_submit) { + int diff = to_submit - ret; + + s->done += ret; + prepped -= diff; + goto submit_more; + } + s->done += ret; + prepped = 0; + continue; + } else if (ret < 0) { + if (errno == EAGAIN) { + if (s->finish) + break; + if (this_reap) + goto submit; + to_submit = 0; + goto submit; + } + printf("io_submit: %s\n", strerror(errno)); + break; } + } while (!s->finish); + + if (register_ring) + io_uring_unregister_ring(s); + + finish = 1; + return NULL; +} + +#ifdef CONFIG_PWRITEV2 +static void *submitter_sync_fn(void *data) +{ + struct submitter *s = data; + int ret; + + submitter_init(s); + + do { + uint64_t offset; + struct file *f; + + if (s->nr_files == 1) { + f = &s->files[0]; + } else { + f = &s->files[s->cur_file]; + if (f->pending_ios >= file_depth(s)) { + s->cur_file++; + if (s->cur_file == s->nr_files) + s->cur_file = 0; + f = &s->files[s->cur_file]; + } + } + f->pending_ios++; + +#ifdef ARCH_HAVE_CPU_CLOCK + if (stats) + s->clock_batch[s->clock_index] = get_cpu_clock(); +#endif + + s->inflight++; + s->calls++; + + offset = get_offset(s, f); + if (polled) + ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, RWF_HIPRI); + else + ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, 0); + + if (ret < 0) { + perror("preadv2"); + break; + } else if (ret != bs) { + break; + } + + s->done++; + s->inflight--; + f->pending_ios--; + if (stats) + add_stat(s, s->clock_index, 1); + } while (!s->finish); + + finish = 1; + return NULL; +} +#else +static void *submitter_sync_fn(void *data) +{ + finish = 1; + return NULL; +} +#endif + +static struct submitter *get_submitter(int offset) +{ + void *ret; + + ret = submitter; + if (offset) + ret += offset * (sizeof(*submitter) + depth * sizeof(struct iovec)); + return ret; +} + +static void do_finish(const char *reason) +{ + int j; + + printf("Exiting on %s\n", reason); + for (j = 0; j < nthreads; j++) { + struct submitter *s = get_submitter(j); + s->finish = 1; + } + if (max_iops > 1000000) { + double miops = (double) max_iops / 1000000.0; + printf("Maximum IOPS=%.2fM\n", miops); + } else if (max_iops > 100000) { + double kiops = (double) max_iops / 1000.0; + printf("Maximum IOPS=%.2fK\n", kiops); + } else { + printf("Maximum IOPS=%lu\n", max_iops); } + finish = 1; +} + +static void sig_int(int sig) +{ + do_finish("signal"); +} + +static void arm_sig_int(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sig_int; + act.sa_flags = SA_RESTART; + sigaction(SIGINT, &act, NULL); + + /* Windows uses SIGBREAK as a quit signal from other applications */ +#ifdef WIN32 + sigaction(SIGBREAK, &act, NULL); +#endif } -static void usage(char *argv) +static void usage(char *argv, int status) { + char runtime_str[16]; + snprintf(runtime_str, sizeof(runtime_str), "%d", runtime); printf("%s [options] -- [filenames]\n" - " -d : IO Depth, default %d\n" - " -s : Batch submit, default %d\n" - " -c : Batch complete, default %d\n" - " -b : Block size, default %d\n" - " -p : Polled IO, default %d\n", - argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, 
polled); - exit(0); + " -d : IO Depth, default %d\n" + " -s : Batch submit, default %d\n" + " -c : Batch complete, default %d\n" + " -b : Block size, default %d\n" + " -p : Polled IO, default %d\n" + " -B : Fixed buffers, default %d\n" + " -D : DMA map fixed buffers, default %d\n" + " -F : Register files, default %d\n" + " -n : Number of threads, default %d\n" + " -O : Use O_DIRECT, default %d\n" + " -N : Perform just no-op requests, default %d\n" + " -t : Track IO latencies, default %d\n" + " -T : TSC rate in HZ\n" + " -r : Runtime in seconds, default %s\n" + " -R : Use random IO, default %d\n" + " -a : Use legacy aio, default %d\n" + " -S : Use sync IO (preadv2), default %d\n" + " -X : Use registered ring %d\n" + " -P : Automatically place on device home node %d\n" + " -u : Use nvme-passthrough I/O, default %d\n", + argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled, + fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop, + stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio, + use_sync, register_ring, numa_placement, pt); + exit(status); +} + +static void read_tsc_rate(void) +{ + char buffer[32]; + int fd, ret; + + if (tsc_rate) + return; + + fd = open(TSC_RATE_FILE, O_RDONLY); + if (fd < 0) + return; + + ret = read(fd, buffer, sizeof(buffer)); + if (ret < 0) { + close(fd); + return; + } + + tsc_rate = strtoul(buffer, NULL, 10); + printf("Using TSC rate %luHz\n", tsc_rate); + close(fd); +} + +static void write_tsc_rate(void) +{ + char buffer[32]; + struct stat sb; + int fd, ret; + + if (!stat(TSC_RATE_FILE, &sb)) + return; + + fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644); + if (fd < 0) + return; + + memset(buffer, 0, sizeof(buffer)); + sprintf(buffer, "%lu", tsc_rate); + ret = write(fd, buffer, strlen(buffer)); + if (ret < 0) + perror("write"); + close(fd); } int main(int argc, char *argv[]) { struct submitter *s; unsigned long done, calls, reap; - int err, i, j, flags, fd, opt; - char *fdepths; + int i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles; + struct file f; void *ret; - if (!do_nop && argc < 2) { - printf("%s: filename [options]\n", argv[0]); - return 1; - } + if (!do_nop && argc < 2) + usage(argv[0], 1); - while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:h?")) != -1) { + while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:u:h?")) != -1) { switch (opt) { + case 'a': + aio = !!atoi(optarg); + break; case 'd': depth = atoi(optarg); break; case 's': batch_submit = atoi(optarg); + if (!batch_submit) + batch_submit = 1; break; case 'c': batch_complete = atoi(optarg); + if (!batch_complete) + batch_complete = 1; break; case 'b': bs = atoi(optarg); @@ -586,19 +1641,81 @@ int main(int argc, char *argv[]) break; case 'n': nthreads = atoi(optarg); + if (!nthreads) { + printf("Threads must be non-zero\n"); + usage(argv[0], 1); + } + break; + case 'N': + do_nop = !!atoi(optarg); + break; + case 'O': + buffered = !atoi(optarg); + break; + case 't': +#ifndef ARCH_HAVE_CPU_CLOCK + fprintf(stderr, "Stats not supported on this CPU\n"); + return 1; +#endif + stats = !!atoi(optarg); + break; + case 'T': +#ifndef ARCH_HAVE_CPU_CLOCK + fprintf(stderr, "Stats not supported on this CPU\n"); + return 1; +#endif + tsc_rate = strtoul(optarg, NULL, 10); + write_tsc_rate(); + break; + case 'r': + runtime = atoi(optarg); + break; + case 'D': + dma_map = !!atoi(optarg); + break; + case 'R': + random_io = !!atoi(optarg); + break; + case 'X': + register_ring = !!atoi(optarg); + break; + case 'S': +#ifdef CONFIG_PWRITEV2 + use_sync = 
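+			/* sync engine: one preadv2(2) per IO, RWF_HIPRI when polled */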
!!atoi(optarg); +#else + fprintf(stderr, "preadv2 not supported\n"); + exit(1); +#endif + break; + case 'P': + numa_placement = !!atoi(optarg); + break; + case 'u': + pt = !!atoi(optarg); break; case 'h': case '?': default: - usage(argv[0]); + usage(argv[0], 0); break; } } + if (stats) + read_tsc_rate(); + + if (batch_complete > depth) + batch_complete = depth; + if (batch_submit > depth) + batch_submit = depth; + if (!fixedbufs && dma_map) + dma_map = 0; + submitter = calloc(nthreads, sizeof(*submitter) + - depth * sizeof(struct iovec)); + roundup_pow2(depth) * sizeof(struct iovec)); for (j = 0; j < nthreads; j++) { s = get_submitter(j); + s->numa_node = -1; s->index = j; s->done = s->calls = s->reaps = 0; } @@ -609,95 +1726,104 @@ int main(int argc, char *argv[]) j = 0; i = optind; - printf("i %d, argc %d\n", i, argc); + nfiles = argc - i; + if (!do_nop) { + if (!nfiles) { + printf("No files specified\n"); + usage(argv[0], 1); + } + threads_per_f = nthreads / nfiles; + /* make sure each thread gets assigned files */ + if (threads_per_f == 0) { + threads_per_f = 1; + } else { + threads_rem = nthreads - threads_per_f * nfiles; + } + } while (!do_nop && i < argc) { - struct file *f; + int k, limit; + + memset(&f, 0, sizeof(f)); - s = get_submitter(j); - if (s->nr_files == MAX_FDS) { - printf("Max number of files (%d) reached\n", MAX_FDS); - break; - } fd = open(argv[i], flags); if (fd < 0) { perror("open"); return 1; } - - f = &s->files[s->nr_files]; - f->real_fd = fd; - if (get_file_size(f)) { + f.real_fd = fd; + if (get_file_size(&f)) { printf("failed getting size of device/file\n"); return 1; } - if (f->max_blocks <= 1) { + if (f.max_blocks <= 1) { printf("Zero file/device size?\n"); return 1; } - f->max_blocks--; - - printf("Added file %s (submitter %d)\n", argv[i], s->index); - s->nr_files++; - i++; - if (++j >= nthreads) - j = 0; - } + f.max_blocks--; - if (fixedbufs) { - struct rlimit rlim; + limit = threads_per_f; + limit += threads_rem > 0 ? 
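+		/* spread leftover threads one extra per file until exhausted */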
1 : 0; + for (k = 0; k < limit; k++) { + s = get_submitter((j + k) % nthreads); - rlim.rlim_cur = RLIM_INFINITY; - rlim.rlim_max = RLIM_INFINITY; - if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { - perror("setrlimit"); - return 1; - } - } + if (s->nr_files == MAX_FDS) { + printf("Max number of files (%d) reached\n", MAX_FDS); + break; + } - arm_sig_int(); + memcpy(&s->files[s->nr_files], &f, sizeof(f)); - for (j = 0; j < nthreads; j++) { - s = get_submitter(j); - for (i = 0; i < depth; i++) { - void *buf; + if (numa_placement) + detect_node(s, argv[i]); - if (posix_memalign(&buf, bs, bs)) { - printf("failed alloc\n"); - return 1; - } - s->iovecs[i].iov_base = buf; - s->iovecs[i].iov_len = bs; + s->filename = argv[i]; + s->nr_files++; } + threads_rem--; + i++; + j += limit; } - for (j = 0; j < nthreads; j++) { - s = get_submitter(j); + arm_sig_int(); - err = setup_ring(s); - if (err) { - printf("ring setup failed: %s, %d\n", strerror(errno), err); - return 1; - } - } - s = get_submitter(0); - printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d", polled, fixedbufs, register_files, buffered); - printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", depth, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); + t_io_uring_page_size = sysconf(_SC_PAGESIZE); + if (t_io_uring_page_size < 0) + t_io_uring_page_size = 4096; for (j = 0; j < nthreads; j++) { s = get_submitter(j); - pthread_create(&s->thread, NULL, submitter_fn, s); + if (use_sync) + pthread_create(&s->thread, NULL, submitter_sync_fn, s); + else if (!aio) + pthread_create(&s->thread, NULL, submitter_uring_fn, s); +#ifdef CONFIG_LIBAIO + else + pthread_create(&s->thread, NULL, submitter_aio_fn, s); +#endif } - fdepths = malloc(8 * s->nr_files * nthreads); reap = calls = done = 0; do { unsigned long this_done = 0; unsigned long this_reap = 0; unsigned long this_call = 0; unsigned long rpc = 0, ipc = 0; + unsigned long iops, bw; sleep(1); + if (runtime && !--runtime) + do_finish("timeout"); + + /* don't print partial run, if interrupted by signal */ + if (finish) + break; + + /* one second in to the run, enable stats */ + if (stats) + stats_running = 1; + for (j = 0; j < nthreads; j++) { + s = get_submitter(j); this_done += s->done; this_call += s->calls; this_reap += s->reaps; @@ -707,9 +1833,31 @@ int main(int argc, char *argv[]) ipc = (this_reap - reap) / (this_call - calls); } else rpc = ipc = -1; - file_depths(fdepths); - printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=(%s)\n", - this_done - done, rpc, ipc, fdepths); + iops = this_done - done; + if (bs > 1048576) + bw = iops * (bs / 1048576); + else + bw = iops / (1048576 / bs); + if (iops > 1000000) { + double miops = (double) iops / 1000000.0; + printf("IOPS=%.2fM, ", miops); + } else if (iops > 100000) { + double kiops = (double) iops / 1000.0; + printf("IOPS=%.2fK, ", kiops); + } else { + printf("IOPS=%lu, ", iops); + } + max_iops = max(max_iops, iops); + if (!do_nop) { + if (bw > 2000) { + double bw_g = (double) bw / 1000.0; + + printf("BW=%.2fGiB/s, ", bw_g); + } else { + printf("BW=%luMiB/s, ", bw); + } + } + printf("IOS/call=%ld/%ld\n", rpc, ipc); done = this_done; calls = this_call; reap = this_reap; @@ -719,8 +1867,19 @@ int main(int argc, char *argv[]) s = get_submitter(j); pthread_join(s->thread, &ret); close(s->ring_fd); + + if (stats) { + unsigned long nr; + + printf("%d: Latency percentiles:\n", s->tid); + for (i = 0, nr = 0; i < PLAT_NR; i++) + nr += s->plat[i]; + show_clat_percentiles(s->plat, nr, 4); + free(s->clock_batch); + free(s->plat); + } } - free(fdepths); + 
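+	/* per-thread latency percentiles were printed above; release shared state */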
free(submitter); return 0; }
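
Example invocations of the rewritten tool (device paths and the t/io_uring
binary name are illustrative; the flags follow the usage() text above):

	# polled IO, fixed buffers and registered files, QD 128
	t/io_uring -d128 -s32 -c32 -p1 -B1 -F1 /dev/nvme0n1

	# nvme passthrough (-u) needs the generic char node and non-polled IO
	t/io_uring -u1 -p0 -b4096 /dev/ng0n1

	# legacy libaio engine with latency tracking enabled
	t/io_uring -a1 -t1 /dev/nvme0n1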