6405536006ce3ff7386d6da26137196b2c770ca0
[fio.git] / t / io_uring.c
1 #include <stdio.h>
2 #include <errno.h>
3 #include <assert.h>
4 #include <stdlib.h>
5 #include <stddef.h>
6 #include <signal.h>
7 #include <inttypes.h>
8
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/ioctl.h>
12 #include <sys/syscall.h>
13 #include <sys/resource.h>
14 #include <sys/mman.h>
15 #include <sys/uio.h>
16 #include <linux/fs.h>
17 #include <fcntl.h>
18 #include <unistd.h>
19 #include <string.h>
20 #include <pthread.h>
21 #include <sched.h>
22
23 #include "../arch/arch.h"
24 #include "../lib/types.h"
25 #include "../os/io_uring.h"
26
/*
 * Compiler-only barrier: the empty asm statement with a "memory" clobber
 * keeps the compiler from caching or reordering memory accesses across it.
 */
#define barrier()	__asm__ __volatile__("": : :"memory")

/*
 * Smaller of two values. Both arguments are parenthesized everywhere they are
 * used so that low-precedence operators in an argument (e.g. min(x & 1, 2))
 * bind as the caller wrote them. The selected argument is evaluated twice, so
 * do not pass expressions with side effects.
 */
#define min(a, b)		(((a) < (b)) ? (a) : (b))
30
/*
 * Application-side view of the submission ring. setup_ring() points every
 * member into the mmap'ed ring region using the offsets reported in
 * io_uring_params.sq_off.
 */
struct io_sq_ring {
	unsigned int *head;		/* read in prep_more_ios() */
	unsigned int *tail;		/* advanced in prep_more_ios() */
	unsigned int *ring_mask;	/* cached into sq_ring_mask */
	unsigned int *ring_entries;	/* printed by main() */
	unsigned int *array;		/* slot -> iocb index table */
};
38
/*
 * Application-side view of the completion ring. setup_ring() points every
 * member into the mmap'ed ring region using the offsets reported in
 * io_uring_params.cq_off.
 */
struct io_cq_ring {
	unsigned int *head;		/* advanced in reap_events() */
	unsigned int *tail;		/* read in reap_events() */
	unsigned int *ring_mask;	/* cached into cq_ring_mask */
	unsigned int *ring_entries;	/* printed by main() */
	struct io_uring_event *events;	/* completion entries */
};
46
/* Compile-time tunables for the benchmark. */
enum {
	DEPTH		= 32,	/* ring size requested and max IOs in flight */

	BATCH_SUBMIT	= 8,	/* max entries prepared per loop iteration */
	BATCH_COMPLETE	= 8,	/* cap on completions waited for per call */

	BS		= 4096,	/* size in bytes of every read and buffer */
};

/* Ring index masks, copied out of the mapped rings by setup_ring(). */
static unsigned int sq_ring_mask;
static unsigned int cq_ring_mask;
55
56 struct submitter {
57         pthread_t thread;
58         unsigned long max_blocks;
59         int ring_fd;
60         struct drand48_data rand;
61         struct io_sq_ring sq_ring;
62         struct io_uring_iocb *iocbs;
63         struct iovec iovecs[DEPTH];
64         struct io_cq_ring cq_ring;
65         int inflight;
66         unsigned long reaps;
67         unsigned long done;
68         unsigned long calls;
69         unsigned long cachehit, cachemiss;
70         volatile int finish;
71         char filename[128];
72 };
73
74 static struct submitter submitters[1];
75 static volatile int finish;
76
77 static int polled = 0;          /* use IO polling */
78 static int fixedbufs = 0;       /* use fixed user buffers */
79 static int buffered = 1;        /* use buffered IO, not O_DIRECT */
80 static int sq_thread = 0;       /* use kernel submission thread */
81 static int sq_thread_cpu = 0;   /* pin above thread to this CPU */
82
83 static int io_uring_setup(unsigned entries, struct iovec *iovecs,
84                           struct io_uring_params *p)
85 {
86         return syscall(__NR_sys_io_uring_setup, entries, iovecs, p);
87 }
88
89 static int io_uring_enter(struct submitter *s, unsigned int to_submit,
90                           unsigned int min_complete, unsigned int flags)
91 {
92         return syscall(__NR_sys_io_uring_enter, s->ring_fd, to_submit,
93                         min_complete, flags);
94 }
95
/*
 * Return the calling thread's kernel thread id via the raw syscall.
 * NOTE(review): newer C libraries may declare their own gettid(); confirm
 * this static definition does not clash with the toolchain in use.
 */
static int gettid(void)
{
	long tid = syscall(__NR_gettid);

	return tid;
}
100
101 static void init_io(struct submitter *s, int fd, unsigned index)
102 {
103         struct io_uring_iocb *iocb = &s->iocbs[index];
104         unsigned long offset;
105         long r;
106
107         lrand48_r(&s->rand, &r);
108         offset = (r % (s->max_blocks - 1)) * BS;
109
110         iocb->opcode = IORING_OP_READ;
111         iocb->flags = 0;
112         iocb->ioprio = 0;
113         iocb->fd = fd;
114         iocb->off = offset;
115         iocb->addr = s->iovecs[index].iov_base;
116         iocb->len = BS;
117 }
118
119 static int prep_more_ios(struct submitter *s, int fd, int max_ios)
120 {
121         struct io_sq_ring *ring = &s->sq_ring;
122         unsigned index, tail, next_tail, prepped = 0;
123
124         next_tail = tail = *ring->tail;
125         do {
126                 next_tail++;
127                 barrier();
128                 if (next_tail == *ring->head)
129                         break;
130
131                 index = tail & sq_ring_mask;
132                 init_io(s, fd, index);
133                 ring->array[index] = index;
134                 prepped++;
135                 tail = next_tail;
136         } while (prepped < max_ios);
137
138         if (*ring->tail != tail) {
139                 /* order tail store with writes to iocbs above */
140                 barrier();
141                 *ring->tail = tail;
142                 barrier();
143         }
144         return prepped;
145 }
146
147 static int get_file_size(int fd, unsigned long *blocks)
148 {
149         struct stat st;
150
151         if (fstat(fd, &st) < 0)
152                 return -1;
153         if (S_ISBLK(st.st_mode)) {
154                 unsigned long long bytes;
155
156                 if (ioctl(fd, BLKGETSIZE64, &bytes) != 0)
157                         return -1;
158
159                 *blocks = bytes / BS;
160                 return 0;
161         } else if (S_ISREG(st.st_mode)) {
162                 *blocks = st.st_size / BS;
163                 return 0;
164         }
165
166         return -1;
167 }
168
169 static int reap_events(struct submitter *s)
170 {
171         struct io_cq_ring *ring = &s->cq_ring;
172         struct io_uring_event *ev;
173         unsigned head, reaped = 0;
174
175         head = *ring->head;
176         do {
177                 barrier();
178                 if (head == *ring->tail)
179                         break;
180                 ev = &ring->events[head & cq_ring_mask];
181                 if (ev->res != BS) {
182                         struct io_uring_iocb *iocb = &s->iocbs[ev->index];
183
184                         printf("io: unexpected ret=%d\n", ev->res);
185                         printf("offset=%lu, size=%lu\n",
186                                         (unsigned long) iocb->off,
187                                         (unsigned long) iocb->len);
188                         return -1;
189                 }
190                 if (ev->flags & IOEV_FLAG_CACHEHIT)
191                         s->cachehit++;
192                 else
193                         s->cachemiss++;
194                 reaped++;
195                 head++;
196         } while (1);
197
198         s->inflight -= reaped;
199         *ring->head = head;
200         barrier();
201         return reaped;
202 }
203
204 static void *submitter_fn(void *data)
205 {
206         struct submitter *s = data;
207         int fd, ret, prepped, flags;
208
209         printf("submitter=%d\n", gettid());
210
211         flags = O_RDONLY;
212         if (!buffered)
213                 flags |= O_DIRECT;
214         fd = open(s->filename, flags);
215         if (fd < 0) {
216                 perror("open");
217                 goto done;
218         }
219
220         if (get_file_size(fd, &s->max_blocks)) {
221                 printf("failed getting size of device/file\n");
222                 goto err;
223         }
224         if (s->max_blocks <= 1) {
225                 printf("Zero file/device size?\n");
226                 goto err;
227         }
228         s->max_blocks--;
229
230         srand48_r(pthread_self(), &s->rand);
231
232         prepped = 0;
233         do {
234                 int to_wait, to_submit, this_reap, to_prep;
235
236                 if (!prepped && s->inflight < DEPTH) {
237                         to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT);
238                         prepped = prep_more_ios(s, fd, to_prep);
239                 }
240                 s->inflight += prepped;
241 submit_more:
242                 to_submit = prepped;
243 submit:
244                 if (s->inflight + BATCH_SUBMIT < DEPTH)
245                         to_wait = 0;
246                 else
247                         to_wait = min(s->inflight + to_submit, BATCH_COMPLETE);
248
249                 ret = io_uring_enter(s, to_submit, to_wait,
250                                         IORING_ENTER_GETEVENTS);
251                 s->calls++;
252
253                 this_reap = reap_events(s);
254                 if (this_reap == -1)
255                         break;
256                 s->reaps += this_reap;
257
258                 if (ret >= 0) {
259                         if (!ret) {
260                                 to_submit = 0;
261                                 if (s->inflight)
262                                         goto submit;
263                                 continue;
264                         } else if (ret < to_submit) {
265                                 int diff = to_submit - ret;
266
267                                 s->done += ret;
268                                 prepped -= diff;
269                                 goto submit_more;
270                         }
271                         s->done += ret;
272                         prepped = 0;
273                         continue;
274                 } else if (ret < 0) {
275                         if (errno == EAGAIN) {
276                                 if (s->finish)
277                                         break;
278                                 if (this_reap)
279                                         goto submit;
280                                 to_submit = 0;
281                                 goto submit;
282                         }
283                         printf("io_submit: %s\n", strerror(errno));
284                         break;
285                 }
286         } while (!s->finish);
287 err:
288         close(fd);
289 done:
290         finish = 1;
291         return NULL;
292 }
293
294 static void sig_int(int sig)
295 {
296         printf("Exiting on signal %d\n", sig);
297         submitters[0].finish = 1;
298         finish = 1;
299 }
300
301 static void arm_sig_int(void)
302 {
303         struct sigaction act;
304
305         memset(&act, 0, sizeof(act));
306         act.sa_handler = sig_int;
307         act.sa_flags = SA_RESTART;
308         sigaction(SIGINT, &act, NULL);
309 }
310
311 static int setup_ring(struct submitter *s)
312 {
313         struct io_sq_ring *sring = &s->sq_ring;
314         struct io_cq_ring *cring = &s->cq_ring;
315         struct io_uring_params p;
316         void *ptr;
317         int fd;
318
319         memset(&p, 0, sizeof(p));
320
321         if (polled)
322                 p.flags |= IORING_SETUP_IOPOLL;
323         if (fixedbufs)
324                 p.flags |= IORING_SETUP_FIXEDBUFS;
325         if (buffered)
326                 p.flags |= IORING_SETUP_SQWQ;
327         else if (sq_thread) {
328                 p.flags |= IORING_SETUP_SQTHREAD;
329                 p.sq_thread_cpu = sq_thread_cpu;
330         }
331
332         if (fixedbufs)
333                 fd = io_uring_setup(DEPTH, s->iovecs, &p);
334         else
335                 fd = io_uring_setup(DEPTH, NULL, &p);
336         if (fd < 0) {
337                 perror("io_uring_setup");
338                 return 1;
339         }
340
341         s->ring_fd = fd;
342         ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
343                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
344                         IORING_OFF_SQ_RING);
345         printf("sq_ring ptr = 0x%p\n", ptr);
346         sring->head = ptr + p.sq_off.head;
347         sring->tail = ptr + p.sq_off.tail;
348         sring->ring_mask = ptr + p.sq_off.ring_mask;
349         sring->ring_entries = ptr + p.sq_off.ring_entries;
350         sring->array = ptr + p.sq_off.array;
351         sq_ring_mask = *sring->ring_mask;
352
353         s->iocbs = mmap(0, p.sq_entries * sizeof(struct io_uring_iocb),
354                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
355                         IORING_OFF_IOCB);
356         printf("iocbs ptr   = 0x%p\n", s->iocbs);
357
358         ptr = mmap(0, p.cq_off.events + p.cq_entries * sizeof(struct io_uring_event),
359                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
360                         IORING_OFF_CQ_RING);
361         printf("cq_ring ptr = 0x%p\n", ptr);
362         cring->head = ptr + p.cq_off.head;
363         cring->tail = ptr + p.cq_off.tail;
364         cring->ring_mask = ptr + p.cq_off.ring_mask;
365         cring->ring_entries = ptr + p.cq_off.ring_entries;
366         cring->events = ptr + p.cq_off.events;
367         cq_ring_mask = *cring->ring_mask;
368         return 0;
369 }
370
371 int main(int argc, char *argv[])
372 {
373         struct submitter *s = &submitters[0];
374         unsigned long done, calls, reap, cache_hit, cache_miss;
375         int err, i;
376         struct rlimit rlim;
377         void *ret;
378
379         if (argc < 2) {
380                 printf("%s: filename\n", argv[0]);
381                 return 1;
382         }
383
384         rlim.rlim_cur = RLIM_INFINITY;
385         rlim.rlim_max = RLIM_INFINITY;
386         if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
387                 perror("setrlimit");
388                 return 1;
389         }
390
391         arm_sig_int();
392
393         for (i = 0; i < DEPTH; i++) {
394                 void *buf;
395
396                 if (posix_memalign(&buf, BS, BS)) {
397                         printf("failed alloc\n");
398                         return 1;
399                 }
400                 s->iovecs[i].iov_base = buf;
401                 s->iovecs[i].iov_len = BS;
402         }
403
404         err = setup_ring(s);
405         if (err) {
406                 printf("ring setup failed: %s, %d\n", strerror(errno), err);
407                 return 1;
408         }
409         printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered);
410         printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
411         strcpy(s->filename, argv[1]);
412
413         pthread_create(&s->thread, NULL, submitter_fn, s);
414
415         cache_hit = cache_miss = reap = calls = done = 0;
416         do {
417                 unsigned long this_done = 0;
418                 unsigned long this_reap = 0;
419                 unsigned long this_call = 0;
420                 unsigned long this_cache_hit = 0;
421                 unsigned long this_cache_miss = 0;
422                 unsigned long rpc = 0, ipc = 0;
423                 double hit = 0.0;
424
425                 sleep(1);
426                 this_done += s->done;
427                 this_call += s->calls;
428                 this_reap += s->reaps;
429                 this_cache_hit += s->cachehit;
430                 this_cache_miss += s->cachemiss;
431                 if (this_cache_hit && this_cache_miss) {
432                         unsigned long hits, total;
433
434                         hits = this_cache_hit - cache_hit;
435                         total = hits + this_cache_miss - cache_miss;
436                         hit = (double) hits / (double) total;
437                         hit *= 100.0;
438                 }
439                 if (this_call - calls) {
440                         rpc = (this_done - done) / (this_call - calls);
441                         ipc = (this_reap - reap) / (this_call - calls);
442                 }
443                 printf("IOPS=%lu, IOS/call=%lu/%lu, inflight=%u (head=%u tail=%u), Cachehit=%0.2f%%\n",
444                                 this_done - done, rpc, ipc, s->inflight,
445                                 *s->cq_ring.head, *s->cq_ring.tail, hit);
446                 done = this_done;
447                 calls = this_call;
448                 reap = this_reap;
449                 cache_hit = s->cachehit;
450                 cache_miss = s->cachemiss;
451         } while (!finish);
452
453         pthread_join(s->thread, &ret);
454         close(s->ring_fd);
455         return 0;
456 }