#include <string.h>
#include <pthread.h>
#include <sched.h>
+#include <libgen.h>
#include "../arch/arch.h"
+#include "../os/os.h"
#include "../lib/types.h"
#include "../lib/roundup.h"
#include "../lib/rand.h"
unsigned long reaps;
unsigned long done;
unsigned long calls;
+ unsigned long io_errors;
volatile int finish;
__s32 *fds;
#endif
int numa_node;
+ int per_file_depth;
const char *filename;
struct file files[MAX_FDS];
static int bs = BS;
static int polled = 1; /* use IO polling */
static int fixedbufs = 1; /* use fixed user buffers */
-static int dma_map; /* pre-map DMA buffers */
static int register_files = 1; /* use fixed files */
static int buffered = 0; /* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0; /* use kernel submission/poller thread */
80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
static int plist_len = 17;
-#ifndef IORING_REGISTER_MAP_BUFFERS
-#define IORING_REGISTER_MAP_BUFFERS 22
-struct io_uring_map_buffers {
- __s32 fd;
- __u32 buf_start;
- __u32 buf_end;
- __u32 flags;
- __u64 rsvd[2];
-};
-#endif
-
static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
enum nvme_csi csi, void *data)
{
#endif
}
-static int io_uring_map_buffers(struct submitter *s)
-{
- struct io_uring_map_buffers map = {
- .fd = s->files[0].real_fd,
- .buf_end = depth,
- };
-
- if (do_nop)
- return 0;
- if (s->nr_files > 1)
- fprintf(stdout, "Mapping buffers may not work with multiple files\n");
-
- return syscall(__NR_io_uring_register, s->ring_fd,
- IORING_REGISTER_MAP_BUFFERS, &map, 1);
-}
-
static int io_uring_register_buffers(struct submitter *s)
{
if (do_nop)
static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
+	int ret;
+
	/*
	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
	 * than that.
	p->flags |= IORING_SETUP_CQSIZE;
	p->cq_entries = entries;
-	return syscall(__NR_io_uring_setup, entries, p);
+	/*
+	 * Opportunistically request the newer task-run / single-issuer
+	 * optimizations. Kernels that predate a flag reject the whole
+	 * setup with EINVAL, so on EINVAL strip the optional flags one
+	 * at a time and retry until setup succeeds or only a genuine
+	 * error remains.
+	 */
+	p->flags |= IORING_SETUP_COOP_TASKRUN;
+	p->flags |= IORING_SETUP_SINGLE_ISSUER;
+	p->flags |= IORING_SETUP_DEFER_TASKRUN;
+retry:
+	ret = syscall(__NR_io_uring_setup, entries, p);
+	if (!ret)
+		return 0;
+
+	if (errno == EINVAL && p->flags & IORING_SETUP_COOP_TASKRUN) {
+		p->flags &= ~IORING_SETUP_COOP_TASKRUN;
+		goto retry;
+	}
+	if (errno == EINVAL && p->flags & IORING_SETUP_SINGLE_ISSUER) {
+		p->flags &= ~IORING_SETUP_SINGLE_ISSUER;
+		goto retry;
+	}
+	if (errno == EINVAL && p->flags & IORING_SETUP_DEFER_TASKRUN) {
+		p->flags &= ~IORING_SETUP_DEFER_TASKRUN;
+		goto retry;
+	}
+
+	return ret;
}
static void io_uring_probe(int fd)
struct io_uring_probe *p;
int ret;
- p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
+ p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
if (!p)
return;
- memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
ret = syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE, p, 256);
if (ret < 0)
goto out;
#endif
}
-#ifndef CONFIG_HAVE_GETTID
-static int gettid(void)
+/*
+ * Return the next IO offset (in bytes) for file f. With random_io set,
+ * pick a random block in [0, max_blocks) and scale by the block size;
+ * otherwise advance sequentially, wrapping back to offset 0 once the
+ * next IO would run past max_size.
+ */
+static unsigned long long get_offset(struct submitter *s, struct file *f)
{
-	return syscall(__NR_gettid);
-}
-#endif
+	unsigned long long offset;
+	long r;
-static unsigned file_depth(struct submitter *s)
-{
-	return (depth + s->nr_files - 1) / s->nr_files;
+	if (random_io) {
+		unsigned long long block;
+
+		r = __rand64(&s->rand_state);
+		block = r % f->max_blocks;
+		offset = block * (unsigned long long) bs;
+	} else {
		offset = f->cur_off;
		f->cur_off += bs;
		if (f->cur_off + bs > f->max_size)
			f->cur_off = 0;
+	}
+
+	return offset;
}
-static void init_io(struct submitter *s, unsigned index)
+/*
+ * Select the file to submit the next IO against and bump its
+ * pending-IO count. With multiple files, round-robin to the next file
+ * once the current one has per_file_depth IOs in flight, so depth is
+ * spread evenly across files.
+ */
+static struct file *get_next_file(struct submitter *s)
{
-	struct io_uring_sqe *sqe = &s->sqes[index];
-	unsigned long offset;
	struct file *f;
-	long r;
-
-	if (do_nop) {
-		sqe->opcode = IORING_OP_NOP;
-		return;
-	}
	if (s->nr_files == 1) {
		f = &s->files[0];
	} else {
		f = &s->files[s->cur_file];
-		if (f->pending_ios >= file_depth(s)) {
+		if (f->pending_ios >= s->per_file_depth) {
			s->cur_file++;
			if (s->cur_file == s->nr_files)
				s->cur_file = 0;
			f = &s->files[s->cur_file];
		}
	}
+
	f->pending_ios++;
+	return f;
+}
- if (random_io) {
- r = __rand64(&s->rand_state);
- offset = (r % (f->max_blocks - 1)) * bs;
- } else {
- offset = f->cur_off;
- f->cur_off += bs;
- if (f->cur_off + bs > f->max_size)
- f->cur_off = 0;
+/*
+ * Fill in the SQE at 'index' for the next IO: pick the target file and
+ * offset via the shared helpers, or submit a NOP when do_nop is set.
+ */
+static void init_io(struct submitter *s, unsigned index)
+{
+	struct io_uring_sqe *sqe = &s->sqes[index];
+	struct file *f;
+
+	if (do_nop) {
+		sqe->opcode = IORING_OP_NOP;
+		return;
	}
+	f = get_next_file(s);
+
	if (register_files) {
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->fd = f->fixed_fd;
		sqe->buf_index = 0;
	}
	sqe->ioprio = 0;
-	sqe->off = offset;
+	sqe->off = get_offset(s, f);
	sqe->user_data = (unsigned long) f->fileno;
	if (stats && stats_running)
		sqe->user_data |= ((uint64_t)s->clock_index << 32);
struct nvme_uring_cmd *cmd;
unsigned long long slba;
unsigned long long nlb;
- long r;
- if (s->nr_files == 1) {
- f = &s->files[0];
- } else {
- f = &s->files[s->cur_file];
- if (f->pending_ios >= file_depth(s)) {
- s->cur_file++;
- if (s->cur_file == s->nr_files)
- s->cur_file = 0;
- f = &s->files[s->cur_file];
- }
- }
- f->pending_ios++;
+ f = get_next_file(s);
- if (random_io) {
- r = __rand64(&s->rand_state);
- offset = (r % (f->max_blocks - 1)) * bs;
- } else {
- offset = f->cur_off;
- f->cur_off += bs;
- if (f->cur_off + bs > f->max_size)
- f->cur_off = 0;
- }
+ offset = get_offset(s, f);
if (register_files) {
sqe->fd = f->fixed_fd;
sqe->opcode = IORING_OP_URING_CMD;
sqe->user_data = (unsigned long) f->fileno;
if (stats)
- sqe->user_data |= ((unsigned long)s->clock_index << 32);
+ sqe->user_data |= ((__u64) s->clock_index << 32ULL);
sqe->cmd_op = NVME_URING_CMD_IO;
slba = offset >> f->lba_shift;
nlb = (bs >> f->lba_shift) - 1;
cmd->cdw12 = nlb;
cmd->addr = (unsigned long) s->iovecs[index].iov_base;
cmd->data_len = bs;
+ if (fixedbufs) {
+ sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
+ sqe->buf_index = index;
+ }
cmd->nsid = f->nsid;
cmd->opcode = 2;
}
static int prep_more_ios_uring(struct submitter *s, int max_ios)
{
	struct io_sq_ring *ring = &s->sq_ring;
-	unsigned index, tail, next_tail, prepped = 0;
+	unsigned head, index, tail, next_tail, prepped = 0;
+
+	/*
+	 * Load the SQ head once, outside the loop. An acquire load is
+	 * only needed when a kernel SQPOLL thread consumes entries
+	 * concurrently; otherwise only we update the ring and a plain
+	 * load suffices.
+	 */
+	if (sq_thread_poll)
+		head = atomic_load_acquire(ring->head);
+	else
+		head = *ring->head;
	next_tail = tail = *ring->tail;
	do {
		next_tail++;
-		if (next_tail == atomic_load_acquire(ring->head))
+		if (next_tail == head)
			break;
		index = tail & sq_ring_mask;
			init_io_pt(s, index);
		else
			init_io(s, index);
-		ring->array[index] = index;
		prepped++;
		tail = next_tail;
	} while (prepped < max_ios);
bs, lbs);
return -1;
}
- f->max_blocks = nlba / bs;
+ f->max_blocks = nlba;
f->max_size = nlba;
f->lba_shift = ilog2(lbs);
return 0;
do {
struct file *f;
- read_barrier();
if (head == atomic_load_acquire(ring->tail))
break;
cqe = &ring->cqes[head & cq_ring_mask];
f = &s->files[fileno];
f->pending_ios--;
if (cqe->res != bs) {
- printf("io: unexpected ret=%d\n", cqe->res);
- if (polled && cqe->res == -EOPNOTSUPP)
- printf("Your filesystem/driver/kernel doesn't support polled IO\n");
- return -1;
+ if (cqe->res == -ENODATA || cqe->res == -EIO) {
+ s->io_errors++;
+ } else {
+ printf("io: unexpected ret=%d\n", cqe->res);
+ if (polled && cqe->res == -EOPNOTSUPP)
+ printf("Your filesystem/driver/kernel doesn't support polled IO\n");
+ return -1;
+ }
}
}
if (stats) {
do {
struct file *f;
- read_barrier();
if (head == atomic_load_acquire(ring->tail))
break;
index = head & cq_ring_mask;
#endif
}
-static int detect_node(struct submitter *s, const char *name)
+/*
+ * Look up the NUMA home node of the device backing 'name' via sysfs.
+ * 'name' is non-const because POSIX basename() may modify its argument.
+ */
+static int detect_node(struct submitter *s, char *name)
{
#ifdef CONFIG_LIBNUMA
	const char *base = basename(name);
	char str[128];
	int ret, fd, node;
+	/* passthrough devices live under nvme-generic, not block */
+	if (pt)
+		sprintf(str, "/sys/class/nvme-generic/%s/device/numa_node", base);
+	else
+		sprintf(str, "/sys/block/%s/device/numa_node", base);
	fd = open(str, O_RDONLY);
	if (fd < 0)
		return -1;
fixedbufs = register_files = 0;
}
+ s->per_file_depth = (depth + s->nr_files - 1) / s->nr_files;
return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
#else
fprintf(stderr, "Legacy AIO not available on this system/build\n");
struct io_sq_ring *sring = &s->sq_ring;
struct io_cq_ring *cring = &s->cq_ring;
struct io_uring_params p;
- int ret, fd;
+ int ret, fd, i;
void *ptr;
size_t len;
perror("io_uring_register_buffers");
return 1;
}
-
- if (dma_map) {
- ret = io_uring_map_buffers(s);
- if (ret < 0) {
- perror("io_uring_map_buffers");
- return 1;
- }
- }
}
if (register_files) {
cring->ring_entries = ptr + p.cq_off.ring_entries;
cring->cqes = ptr + p.cq_off.cqes;
cq_ring_mask = *cring->ring_mask;
+
+ for (i = 0; i < p.sq_entries; i++)
+ sring->array[i] = i;
+
+ s->per_file_depth = INT_MAX;
+ if (s->nr_files)
+ s->per_file_depth = (depth + s->nr_files - 1) / s->nr_files;
return 0;
}
static int init_printed;
char buf[80];
s->tid = gettid();
- printf("submitter=%d, tid=%d, file=%s, node=%d\n", s->index, s->tid,
- s->filename, s->numa_node);
+ printf("submitter=%d, tid=%d, file=%s, nfiles=%d, node=%d\n", s->index, s->tid,
+ s->filename, s->nr_files, s->numa_node);
set_affinity(s);
buf = allocate_mem(s, bs);
if (!buf)
- return 1;
+ return -1;
s->iovecs[i].iov_base = buf;
s->iovecs[i].iov_len = bs;
}
err = 0;
} else if (!aio) {
err = setup_ring(s);
- sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+ if (!err)
+ sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
} else {
sprintf(buf, "Engine=aio\n");
err = setup_aio(s);
}
if (err) {
printf("queue setup failed: %s, %d\n", strerror(errno), err);
- return 1;
+ return -1;
}
if (!init_printed) {
- printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
+ printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, register_files, buffered, depth);
printf("%s", buf);
init_printed = 1;
}
static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
{
uint64_t data;
- long long offset;
struct file *f;
unsigned index;
- long r;
index = 0;
while (index < max_ios) {
struct iocb *iocb = &iocbs[index];
- if (s->nr_files == 1) {
- f = &s->files[0];
- } else {
- f = &s->files[s->cur_file];
- if (f->pending_ios >= file_depth(s)) {
- s->cur_file++;
- if (s->cur_file == s->nr_files)
- s->cur_file = 0;
- f = &s->files[s->cur_file];
- }
- }
- f->pending_ios++;
+ f = get_next_file(s);
- r = lrand48();
- offset = (r % (f->max_blocks - 1)) * bs;
io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base,
- s->iovecs[index].iov_len, offset);
+ s->iovecs[index].iov_len, get_offset(s, f));
data = f->fileno;
if (stats && stats_running)
f->pending_ios--;
if (events[reaped].res != bs) {
- printf("io: unexpected ret=%ld\n", events[reaped].res);
- return -1;
- }
- if (stats) {
+ if (events[reaped].res == -ENODATA ||
+ events[reaped].res == -EIO) {
+ s->io_errors++;
+ } else {
+ printf("io: unexpected ret=%ld\n", events[reaped].res);
+ return -1;
+ }
+ } else if (stats) {
int clock_index = data >> 32;
if (last_idx != clock_index) {
struct iocb *iocbs;
struct io_event *events;
#ifdef ARCH_HAVE_CPU_CLOCK
- int nr_batch = submitter_init(s);
-#else
- submitter_init(s);
+ int nr_batch;
+#endif
+
+ ret = submitter_init(s);
+ if (ret < 0)
+ goto done;
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+ nr_batch = ret;
#endif
iocbsptr = calloc(depth, sizeof(struct iocb *));
free(iocbsptr);
free(iocbs);
free(events);
+done:
finish = 1;
return NULL;
}
struct io_sq_ring *ring = &s->sq_ring;
int ret, prepped;
#ifdef ARCH_HAVE_CPU_CLOCK
- int nr_batch = submitter_init(s);
-#else
- submitter_init(s);
+ int nr_batch;
+#endif
+
+ ret = submitter_init(s);
+ if (ret < 0)
+ goto done;
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+ nr_batch = ret;
#endif
if (register_ring)
if (register_ring)
io_uring_unregister_ring(s);
+done:
finish = 1;
return NULL;
}
struct submitter *s = data;
int ret;
- submitter_init(s);
+ if (submitter_init(s) < 0)
+ goto done;
do {
uint64_t offset;
struct file *f;
- long r;
-
- if (s->nr_files == 1) {
- f = &s->files[0];
- } else {
- f = &s->files[s->cur_file];
- if (f->pending_ios >= file_depth(s)) {
- s->cur_file++;
- if (s->cur_file == s->nr_files)
- s->cur_file = 0;
- f = &s->files[s->cur_file];
- }
- }
- f->pending_ios++;
- if (random_io) {
- r = __rand64(&s->rand_state);
- offset = (r % (f->max_blocks - 1)) * bs;
- } else {
- offset = f->cur_off;
- f->cur_off += bs;
- if (f->cur_off + bs > f->max_size)
- f->cur_off = 0;
- }
+ f = get_next_file(s);
#ifdef ARCH_HAVE_CPU_CLOCK
if (stats)
s->inflight++;
s->calls++;
+ offset = get_offset(s, f);
if (polled)
ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, RWF_HIPRI);
else
add_stat(s, s->clock_index, 1);
} while (!s->finish);
+done:
finish = 1;
return NULL;
}
" -b <int> : Block size, default %d\n"
" -p <bool> : Polled IO, default %d\n"
" -B <bool> : Fixed buffers, default %d\n"
- " -D <bool> : DMA map fixed buffers, default %d\n"
" -F <bool> : Register files, default %d\n"
" -n <int> : Number of threads, default %d\n"
" -O <bool> : Use O_DIRECT, default %d\n"
" -P <bool> : Automatically place on device home node %d\n"
" -u <bool> : Use nvme-passthrough I/O, default %d\n",
argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
- fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
+ fixedbufs, register_files, nthreads, !buffered, do_nop,
stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio,
use_sync, register_ring, numa_placement, pt);
exit(status);
int main(int argc, char *argv[])
{
struct submitter *s;
- unsigned long done, calls, reap;
+ unsigned long done, calls, reap, io_errors;
int i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
struct file f;
void *ret;
case 'r':
runtime = atoi(optarg);
break;
- case 'D':
- dma_map = !!atoi(optarg);
- break;
case 'R':
random_io = !!atoi(optarg);
break;
batch_complete = depth;
if (batch_submit > depth)
batch_submit = depth;
- if (!fixedbufs && dma_map)
- dma_map = 0;
submitter = calloc(nthreads, sizeof(*submitter) +
roundup_pow2(depth) * sizeof(struct iovec));
s = get_submitter(j);
s->numa_node = -1;
s->index = j;
- s->done = s->calls = s->reaps = 0;
+ s->done = s->calls = s->reaps = s->io_errors = 0;
}
flags = O_RDONLY | O_NOATIME;
#endif
}
- reap = calls = done = 0;
+ reap = calls = done = io_errors = 0;
do {
unsigned long this_done = 0;
unsigned long this_reap = 0;
unsigned long this_call = 0;
+ unsigned long this_io_errors = 0;
unsigned long rpc = 0, ipc = 0;
unsigned long iops, bw;
this_done += s->done;
this_call += s->calls;
this_reap += s->reaps;
+ this_io_errors += s->io_errors;
}
if (this_call - calls) {
rpc = (this_done - done) / (this_call - calls);
} else
rpc = ipc = -1;
iops = this_done - done;
+ iops -= this_io_errors - io_errors;
if (bs > 1048576)
bw = iops * (bs / 1048576);
else
done = this_done;
calls = this_call;
reap = this_reap;
+ io_errors = this_io_errors;
} while (!finish);
for (j = 0; j < nthreads; j++) {
pthread_join(s->thread, &ret);
close(s->ring_fd);
+ if (s->io_errors)
+ printf("%d: %lu IO errors\n", s->tid, s->io_errors);
+
if (stats) {
unsigned long nr;