Commit | Line | Data |
---|---|---|
21b4aa5d JA |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Simple benchmark program that uses the various features of io_uring | |
4 | * to provide fast random access to a device/file. It has various | |
5 | * options that control how we use io_uring, see the OPTIONS section | |
6 | * below. This uses the raw io_uring interface. | |
7 | * | |
8 | * Copyright (C) 2018-2019 Jens Axboe | |
9 | */ | |
10 | #include <stdio.h> | |
11 | #include <errno.h> | |
12 | #include <assert.h> | |
13 | #include <stdlib.h> | |
14 | #include <stddef.h> | |
15 | #include <signal.h> | |
16 | #include <inttypes.h> | |
17 | ||
18 | #include <sys/types.h> | |
19 | #include <sys/stat.h> | |
20 | #include <sys/ioctl.h> | |
21 | #include <sys/syscall.h> | |
22 | #include <sys/resource.h> | |
23 | #include <sys/mman.h> | |
24 | #include <sys/uio.h> | |
25 | #include <linux/fs.h> | |
26 | #include <fcntl.h> | |
27 | #include <unistd.h> | |
28 | #include <string.h> | |
29 | #include <pthread.h> | |
30 | #include <sched.h> | |
31 | ||
32 | #include "liburing.h" | |
33 | #include "barrier.h" | |
34 | ||
21b4aa5d JA |
/*
 * Evaluate to the smaller of a and b.
 *
 * Both arguments are fully parenthesised so that expressions built from
 * operators with lower precedence than '<' (e.g. '&', '|', '?:') compare
 * as written. Each argument may be evaluated twice, so do not pass
 * expressions with side effects.
 */
#define min(a, b)	(((a) < (b)) ? (a) : (b))
36 | ||
/*
 * Application-side handles into the submission queue ring. Every member
 * is a pointer that setup_ring() aims into the SQ ring mapping, using the
 * offsets the kernel reports in io_uring_params.sq_off.
 */
struct io_sq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	unsigned int *flags;
	unsigned int *array;
};
45 | ||
/*
 * Application-side handles into the completion queue ring. setup_ring()
 * points each member into the CQ ring mapping, using the offsets the
 * kernel reports in io_uring_params.cq_off.
 */
struct io_cq_ring {
	unsigned int *head;
	unsigned int *tail;
	unsigned int *ring_mask;
	unsigned int *ring_entries;
	struct io_uring_cqe *cqes;
};
53 | ||
/* Entries requested from io_uring_setup() and number of IO buffers. */
#define DEPTH		128

/* Upper bounds on how many IOs are prepared / waited for per loop pass. */
#define BATCH_SUBMIT	32
#define BATCH_COMPLETE	32

/* Size in bytes of every read, and alignment of every IO buffer. */
#define BS		4096

/* Maximum number of files/devices accepted on the command line. */
#define MAX_FDS		16
62 | ||
/* Cached copies of the SQ/CQ ring masks, read once in setup_ring(). */
static unsigned int sq_ring_mask;
static unsigned int cq_ring_mask;
64 | ||
/* Per-target state for one file or block device being read. */
struct file {
	unsigned long max_blocks;	/* size in BS-sized blocks, set by get_file_size() */
	unsigned int pending_ios;	/* IOs prepared for this file and not yet reaped */
	int real_fd;			/* descriptor returned by open() */
	int fixed_fd;			/* index into the registered file table */
};
71 | ||
72 | struct submitter { | |
73 | pthread_t thread; | |
74 | int ring_fd; | |
75 | struct drand48_data rand; | |
76 | struct io_sq_ring sq_ring; | |
77 | struct io_uring_sqe *sqes; | |
78 | struct iovec iovecs[DEPTH]; | |
79 | struct io_cq_ring cq_ring; | |
80 | int inflight; | |
81 | unsigned long reaps; | |
82 | unsigned long done; | |
83 | unsigned long calls; | |
21b4aa5d JA |
84 | volatile int finish; |
85 | ||
86 | __s32 *fds; | |
87 | ||
88 | struct file files[MAX_FDS]; | |
89 | unsigned nr_files; | |
90 | unsigned cur_file; | |
91 | }; | |
92 | ||
93 | static struct submitter submitters[1]; | |
94 | static volatile int finish; | |
95 | ||
/*
 * OPTIONS: compile-time switches selecting which io_uring features the
 * benchmark exercises. Edit the initialisers and rebuild to change them.
 */
static int polled = 1;		/* use IO polling */
static int fixedbufs = 1;	/* use fixed user buffers */
static int register_files = 1;	/* use fixed files */
static int buffered = 0;	/* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0;	/* use kernel submission/poller thread */
static int sq_thread_cpu = -1;	/* pin above thread to this CPU */
static int do_nop = 0;		/* no-op SQ ring commands */
106 | ||
107 | static int io_uring_register_buffers(struct submitter *s) | |
108 | { | |
109 | if (do_nop) | |
110 | return 0; | |
111 | ||
112 | return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs, | |
113 | DEPTH); | |
114 | } | |
115 | ||
116 | static int io_uring_register_files(struct submitter *s) | |
117 | { | |
118 | unsigned i; | |
119 | ||
120 | if (do_nop) | |
121 | return 0; | |
122 | ||
123 | s->fds = calloc(s->nr_files, sizeof(__s32)); | |
124 | for (i = 0; i < s->nr_files; i++) { | |
125 | s->fds[i] = s->files[i].real_fd; | |
126 | s->files[i].fixed_fd = i; | |
127 | } | |
128 | ||
129 | return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds, | |
130 | s->nr_files); | |
131 | } | |
132 | ||
/* Return the calling thread's kernel thread id via the raw syscall. */
static int gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (int) tid;
}
137 | ||
138 | static unsigned file_depth(struct submitter *s) | |
139 | { | |
140 | return (DEPTH + s->nr_files - 1) / s->nr_files; | |
141 | } | |
142 | ||
143 | static void init_io(struct submitter *s, unsigned index) | |
144 | { | |
145 | struct io_uring_sqe *sqe = &s->sqes[index]; | |
146 | unsigned long offset; | |
147 | struct file *f; | |
148 | long r; | |
149 | ||
150 | if (do_nop) { | |
151 | sqe->opcode = IORING_OP_NOP; | |
152 | return; | |
153 | } | |
154 | ||
155 | if (s->nr_files == 1) { | |
156 | f = &s->files[0]; | |
157 | } else { | |
158 | f = &s->files[s->cur_file]; | |
159 | if (f->pending_ios >= file_depth(s)) { | |
160 | s->cur_file++; | |
161 | if (s->cur_file == s->nr_files) | |
162 | s->cur_file = 0; | |
163 | f = &s->files[s->cur_file]; | |
164 | } | |
165 | } | |
166 | f->pending_ios++; | |
167 | ||
168 | lrand48_r(&s->rand, &r); | |
169 | offset = (r % (f->max_blocks - 1)) * BS; | |
170 | ||
171 | if (register_files) { | |
172 | sqe->flags = IOSQE_FIXED_FILE; | |
173 | sqe->fd = f->fixed_fd; | |
174 | } else { | |
175 | sqe->flags = 0; | |
176 | sqe->fd = f->real_fd; | |
177 | } | |
178 | if (fixedbufs) { | |
179 | sqe->opcode = IORING_OP_READ_FIXED; | |
180 | sqe->addr = (unsigned long) s->iovecs[index].iov_base; | |
181 | sqe->len = BS; | |
182 | sqe->buf_index = index; | |
183 | } else { | |
184 | sqe->opcode = IORING_OP_READV; | |
185 | sqe->addr = (unsigned long) &s->iovecs[index]; | |
186 | sqe->len = 1; | |
187 | sqe->buf_index = 0; | |
188 | } | |
189 | sqe->ioprio = 0; | |
190 | sqe->off = offset; | |
191 | sqe->user_data = (unsigned long) f; | |
192 | } | |
193 | ||
194 | static int prep_more_ios(struct submitter *s, unsigned max_ios) | |
195 | { | |
196 | struct io_sq_ring *ring = &s->sq_ring; | |
197 | unsigned index, tail, next_tail, prepped = 0; | |
198 | ||
199 | next_tail = tail = *ring->tail; | |
200 | do { | |
201 | next_tail++; | |
202 | read_barrier(); | |
203 | if (next_tail == *ring->head) | |
204 | break; | |
205 | ||
206 | index = tail & sq_ring_mask; | |
207 | init_io(s, index); | |
208 | ring->array[index] = index; | |
209 | prepped++; | |
210 | tail = next_tail; | |
211 | } while (prepped < max_ios); | |
212 | ||
213 | if (*ring->tail != tail) { | |
214 | /* order tail store with writes to sqes above */ | |
215 | write_barrier(); | |
216 | *ring->tail = tail; | |
217 | write_barrier(); | |
218 | } | |
219 | return prepped; | |
220 | } | |
221 | ||
222 | static int get_file_size(struct file *f) | |
223 | { | |
224 | struct stat st; | |
225 | ||
226 | if (fstat(f->real_fd, &st) < 0) | |
227 | return -1; | |
228 | if (S_ISBLK(st.st_mode)) { | |
229 | unsigned long long bytes; | |
230 | ||
231 | if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) | |
232 | return -1; | |
233 | ||
234 | f->max_blocks = bytes / BS; | |
235 | return 0; | |
236 | } else if (S_ISREG(st.st_mode)) { | |
237 | f->max_blocks = st.st_size / BS; | |
238 | return 0; | |
239 | } | |
240 | ||
241 | return -1; | |
242 | } | |
243 | ||
244 | static int reap_events(struct submitter *s) | |
245 | { | |
246 | struct io_cq_ring *ring = &s->cq_ring; | |
247 | struct io_uring_cqe *cqe; | |
248 | unsigned head, reaped = 0; | |
249 | ||
250 | head = *ring->head; | |
251 | do { | |
252 | struct file *f; | |
253 | ||
254 | read_barrier(); | |
255 | if (head == *ring->tail) | |
256 | break; | |
257 | cqe = &ring->cqes[head & cq_ring_mask]; | |
258 | if (!do_nop) { | |
259 | f = (struct file *) (uintptr_t) cqe->user_data; | |
260 | f->pending_ios--; | |
261 | if (cqe->res != BS) { | |
262 | printf("io: unexpected ret=%d\n", cqe->res); | |
263 | if (polled && cqe->res == -EOPNOTSUPP) | |
264 | printf("Your filesystem doesn't support poll\n"); | |
265 | return -1; | |
266 | } | |
267 | } | |
21b4aa5d JA |
268 | reaped++; |
269 | head++; | |
270 | } while (1); | |
271 | ||
272 | s->inflight -= reaped; | |
273 | *ring->head = head; | |
274 | write_barrier(); | |
275 | return reaped; | |
276 | } | |
277 | ||
278 | static void *submitter_fn(void *data) | |
279 | { | |
280 | struct submitter *s = data; | |
281 | struct io_sq_ring *ring = &s->sq_ring; | |
282 | int ret, prepped; | |
283 | ||
284 | printf("submitter=%d\n", gettid()); | |
285 | ||
286 | srand48_r(pthread_self(), &s->rand); | |
287 | ||
288 | prepped = 0; | |
289 | do { | |
290 | int to_wait, to_submit, this_reap, to_prep; | |
291 | ||
292 | if (!prepped && s->inflight < DEPTH) { | |
293 | to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT); | |
294 | prepped = prep_more_ios(s, to_prep); | |
295 | } | |
296 | s->inflight += prepped; | |
297 | submit_more: | |
298 | to_submit = prepped; | |
299 | submit: | |
300 | if (to_submit && (s->inflight + to_submit <= DEPTH)) | |
301 | to_wait = 0; | |
302 | else | |
303 | to_wait = min(s->inflight + to_submit, BATCH_COMPLETE); | |
304 | ||
305 | /* | |
306 | * Only need to call io_uring_enter if we're not using SQ thread | |
307 | * poll, or if IORING_SQ_NEED_WAKEUP is set. | |
308 | */ | |
309 | if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { | |
310 | unsigned flags = 0; | |
311 | ||
312 | if (to_wait) | |
313 | flags = IORING_ENTER_GETEVENTS; | |
314 | if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) | |
315 | flags |= IORING_ENTER_SQ_WAKEUP; | |
316 | ret = io_uring_enter(s->ring_fd, to_submit, to_wait, | |
317 | flags, NULL); | |
318 | s->calls++; | |
319 | } | |
320 | ||
321 | /* | |
322 | * For non SQ thread poll, we already got the events we needed | |
323 | * through the io_uring_enter() above. For SQ thread poll, we | |
324 | * need to loop here until we find enough events. | |
325 | */ | |
326 | this_reap = 0; | |
327 | do { | |
328 | int r; | |
329 | r = reap_events(s); | |
330 | if (r == -1) { | |
331 | s->finish = 1; | |
332 | break; | |
333 | } else if (r > 0) | |
334 | this_reap += r; | |
335 | } while (sq_thread_poll && this_reap < to_wait); | |
336 | s->reaps += this_reap; | |
337 | ||
338 | if (ret >= 0) { | |
339 | if (!ret) { | |
340 | to_submit = 0; | |
341 | if (s->inflight) | |
342 | goto submit; | |
343 | continue; | |
344 | } else if (ret < to_submit) { | |
345 | int diff = to_submit - ret; | |
346 | ||
347 | s->done += ret; | |
348 | prepped -= diff; | |
349 | goto submit_more; | |
350 | } | |
351 | s->done += ret; | |
352 | prepped = 0; | |
353 | continue; | |
354 | } else if (ret < 0) { | |
355 | if (errno == EAGAIN) { | |
356 | if (s->finish) | |
357 | break; | |
358 | if (this_reap) | |
359 | goto submit; | |
360 | to_submit = 0; | |
361 | goto submit; | |
362 | } | |
363 | printf("io_submit: %s\n", strerror(errno)); | |
364 | break; | |
365 | } | |
366 | } while (!s->finish); | |
367 | ||
368 | finish = 1; | |
369 | return NULL; | |
370 | } | |
371 | ||
372 | static void sig_int(int sig) | |
373 | { | |
374 | printf("Exiting on signal %d\n", sig); | |
375 | submitters[0].finish = 1; | |
376 | finish = 1; | |
377 | } | |
378 | ||
379 | static void arm_sig_int(void) | |
380 | { | |
381 | struct sigaction act; | |
382 | ||
383 | memset(&act, 0, sizeof(act)); | |
384 | act.sa_handler = sig_int; | |
385 | act.sa_flags = SA_RESTART; | |
386 | sigaction(SIGINT, &act, NULL); | |
387 | } | |
388 | ||
389 | static int setup_ring(struct submitter *s) | |
390 | { | |
391 | struct io_sq_ring *sring = &s->sq_ring; | |
392 | struct io_cq_ring *cring = &s->cq_ring; | |
393 | struct io_uring_params p; | |
394 | int ret, fd; | |
395 | void *ptr; | |
396 | ||
397 | memset(&p, 0, sizeof(p)); | |
398 | ||
399 | if (polled && !do_nop) | |
400 | p.flags |= IORING_SETUP_IOPOLL; | |
401 | if (sq_thread_poll) { | |
402 | p.flags |= IORING_SETUP_SQPOLL; | |
403 | if (sq_thread_cpu != -1) { | |
404 | p.flags |= IORING_SETUP_SQ_AFF; | |
405 | p.sq_thread_cpu = sq_thread_cpu; | |
406 | } | |
407 | } | |
408 | ||
409 | fd = io_uring_setup(DEPTH, &p); | |
410 | if (fd < 0) { | |
411 | perror("io_uring_setup"); | |
412 | return 1; | |
413 | } | |
414 | s->ring_fd = fd; | |
415 | ||
416 | if (fixedbufs) { | |
417 | ret = io_uring_register_buffers(s); | |
418 | if (ret < 0) { | |
419 | perror("io_uring_register_buffers"); | |
420 | return 1; | |
421 | } | |
422 | } | |
423 | ||
424 | if (register_files) { | |
425 | ret = io_uring_register_files(s); | |
426 | if (ret < 0) { | |
427 | perror("io_uring_register_files"); | |
428 | return 1; | |
429 | } | |
430 | } | |
431 | ||
432 | ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), | |
433 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, | |
434 | IORING_OFF_SQ_RING); | |
435 | printf("sq_ring ptr = 0x%p\n", ptr); | |
436 | sring->head = ptr + p.sq_off.head; | |
437 | sring->tail = ptr + p.sq_off.tail; | |
438 | sring->ring_mask = ptr + p.sq_off.ring_mask; | |
439 | sring->ring_entries = ptr + p.sq_off.ring_entries; | |
440 | sring->flags = ptr + p.sq_off.flags; | |
441 | sring->array = ptr + p.sq_off.array; | |
442 | sq_ring_mask = *sring->ring_mask; | |
443 | ||
444 | s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), | |
445 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, | |
446 | IORING_OFF_SQES); | |
447 | printf("sqes ptr = 0x%p\n", s->sqes); | |
448 | ||
449 | ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), | |
450 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, | |
451 | IORING_OFF_CQ_RING); | |
452 | printf("cq_ring ptr = 0x%p\n", ptr); | |
453 | cring->head = ptr + p.cq_off.head; | |
454 | cring->tail = ptr + p.cq_off.tail; | |
455 | cring->ring_mask = ptr + p.cq_off.ring_mask; | |
456 | cring->ring_entries = ptr + p.cq_off.ring_entries; | |
457 | cring->cqes = ptr + p.cq_off.cqes; | |
458 | cq_ring_mask = *cring->ring_mask; | |
459 | return 0; | |
460 | } | |
461 | ||
462 | static void file_depths(char *buf) | |
463 | { | |
464 | struct submitter *s = &submitters[0]; | |
465 | unsigned i; | |
466 | char *p; | |
467 | ||
468 | buf[0] = '\0'; | |
469 | p = buf; | |
470 | for (i = 0; i < s->nr_files; i++) { | |
471 | struct file *f = &s->files[i]; | |
472 | ||
473 | if (i + 1 == s->nr_files) | |
474 | p += sprintf(p, "%d", f->pending_ios); | |
475 | else | |
476 | p += sprintf(p, "%d, ", f->pending_ios); | |
477 | } | |
478 | } | |
479 | ||
480 | int main(int argc, char *argv[]) | |
481 | { | |
482 | struct submitter *s = &submitters[0]; | |
70423667 | 483 | unsigned long done, calls, reap; |
21b4aa5d JA |
484 | int err, i, flags, fd; |
485 | char *fdepths; | |
486 | void *ret; | |
487 | ||
488 | if (!do_nop && argc < 2) { | |
489 | printf("%s: filename\n", argv[0]); | |
490 | return 1; | |
491 | } | |
492 | ||
493 | flags = O_RDONLY | O_NOATIME; | |
494 | if (!buffered) | |
495 | flags |= O_DIRECT; | |
496 | ||
497 | i = 1; | |
498 | while (!do_nop && i < argc) { | |
499 | struct file *f; | |
500 | ||
501 | if (s->nr_files == MAX_FDS) { | |
502 | printf("Max number of files (%d) reached\n", MAX_FDS); | |
503 | break; | |
504 | } | |
505 | fd = open(argv[i], flags); | |
506 | if (fd < 0) { | |
507 | perror("open"); | |
508 | return 1; | |
509 | } | |
510 | ||
511 | f = &s->files[s->nr_files]; | |
512 | f->real_fd = fd; | |
513 | if (get_file_size(f)) { | |
514 | printf("failed getting size of device/file\n"); | |
515 | return 1; | |
516 | } | |
517 | if (f->max_blocks <= 1) { | |
518 | printf("Zero file/device size?\n"); | |
519 | return 1; | |
520 | } | |
521 | f->max_blocks--; | |
522 | ||
523 | printf("Added file %s\n", argv[i]); | |
524 | s->nr_files++; | |
525 | i++; | |
526 | } | |
527 | ||
528 | if (fixedbufs) { | |
529 | struct rlimit rlim; | |
530 | ||
531 | rlim.rlim_cur = RLIM_INFINITY; | |
532 | rlim.rlim_max = RLIM_INFINITY; | |
533 | if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { | |
534 | perror("setrlimit"); | |
535 | return 1; | |
536 | } | |
537 | } | |
538 | ||
539 | arm_sig_int(); | |
540 | ||
541 | for (i = 0; i < DEPTH; i++) { | |
542 | void *buf; | |
543 | ||
544 | if (posix_memalign(&buf, BS, BS)) { | |
545 | printf("failed alloc\n"); | |
546 | return 1; | |
547 | } | |
548 | s->iovecs[i].iov_base = buf; | |
549 | s->iovecs[i].iov_len = BS; | |
550 | } | |
551 | ||
552 | err = setup_ring(s); | |
553 | if (err) { | |
554 | printf("ring setup failed: %s, %d\n", strerror(errno), err); | |
555 | return 1; | |
556 | } | |
557 | printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); | |
558 | printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); | |
559 | ||
560 | pthread_create(&s->thread, NULL, submitter_fn, s); | |
561 | ||
562 | fdepths = malloc(8 * s->nr_files); | |
70423667 | 563 | reap = calls = done = 0; |
21b4aa5d JA |
564 | do { |
565 | unsigned long this_done = 0; | |
566 | unsigned long this_reap = 0; | |
567 | unsigned long this_call = 0; | |
21b4aa5d | 568 | unsigned long rpc = 0, ipc = 0; |
21b4aa5d JA |
569 | |
570 | sleep(1); | |
571 | this_done += s->done; | |
572 | this_call += s->calls; | |
573 | this_reap += s->reaps; | |
21b4aa5d JA |
574 | if (this_call - calls) { |
575 | rpc = (this_done - done) / (this_call - calls); | |
576 | ipc = (this_reap - reap) / (this_call - calls); | |
577 | } else | |
578 | rpc = ipc = -1; | |
579 | file_depths(fdepths); | |
70423667 | 580 | printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", |
21b4aa5d | 581 | this_done - done, rpc, ipc, s->inflight, |
70423667 | 582 | fdepths); |
21b4aa5d JA |
583 | done = this_done; |
584 | calls = this_call; | |
585 | reap = this_reap; | |
21b4aa5d JA |
586 | } while (!finish); |
587 | ||
588 | pthread_join(s->thread, &ret); | |
589 | close(s->ring_fd); | |
590 | free(fdepths); | |
591 | return 0; | |
592 | } |