.\" Copyright (C) 2020 Shuveb Hussain <shuveb@gmail.com>
.\" SPDX-License-Identifier: LGPL-2.0-or-later
.\"
.TH io_uring 7 2020-07-26 "Linux" "Linux Programmer's Manual"
.SH NAME
io_uring \- Asynchronous I/O facility
.SH SYNOPSIS
.nf
.B "#include <linux/io_uring.h>"
.fi
.PP
.SH DESCRIPTION
.PP
.B io_uring
is a Linux-specific API for asynchronous I/O.
It allows the user to submit one or more I/O requests,
which are processed asynchronously without blocking the calling process.
.B io_uring
gets its name from ring buffers which are shared between user space and
kernel space. This arrangement allows for efficient I/O,
while avoiding the overhead of copying buffers between them,
where possible.
This interface makes
.B io_uring
different from other UNIX I/O APIs:
rather than communicating between kernel and user space solely through
system calls,
it uses ring buffers as the main mode of communication.
This arrangement has various performance benefits,
which are discussed in a separate section below.
This man page uses the terms shared buffers,
shared ring buffers and queues interchangeably.
.PP
The general programming model you need to follow for
.B io_uring
is outlined below:
.IP \(bu
Set up shared buffers with
.BR io_uring_setup (2)
and
.BR mmap (2),
mapping into user space shared buffers for the submission queue (SQ) and the
completion queue (CQ).
You place I/O requests you want to make on the SQ,
while the kernel places the results of those operations on the CQ.
.IP \(bu
For every I/O request you need to make (such as reading a file,
writing a file,
or accepting a socket connection),
you create a submission queue entry,
or SQE,
describing the I/O operation you need done,
and add it to the tail of the submission queue (SQ).
Each I/O operation is,
in essence,
the equivalent of a system call you would have made otherwise,
if you were not using
.BR io_uring .
For instance,
an SQE with the
.I opcode
set to
.B IORING_OP_READ
requests a read operation similar to the
.BR read (2)
system call. Refer to the opcode documentation in
.BR io_uring_enter (2)
for all supported opcodes and their properties.
You can add more than one SQE to the queue depending on the number of
operations you want to request.
.IP \(bu
After you add one or more SQEs,
you need to call
.BR io_uring_enter (2)
to tell the kernel to dequeue your I/O requests off the SQ and begin
processing them.
.IP \(bu
For each SQE you submit,
once it is done processing the request,
the kernel places a completion queue event or CQE at the tail of the
completion queue or CQ.
The kernel places exactly one matching CQE in the CQ for every SQE you
submit on the SQ.
After you retrieve a CQE,
minimally,
you might be interested in checking the
.I res
field of the CQE structure,
which corresponds to the return value of the system
call's equivalent,
had you used it directly without using
.BR io_uring .
Given that
.B io_uring
is an async interface,
.I errno
is never used for passing back error information. Instead,
.I res
will contain what the equivalent system call would have returned in case
of success, and in case of error
.I res
will contain
.IR -errno .
For example, if the normal read system call would have returned -1 and set
.I errno
to
.BR EINVAL ,
then
.I res
would contain
.BR -EINVAL .
If the normal system call would have returned a read size of 1024, then
.I res
would contain 1024.
.IP \(bu
Optionally,
.BR io_uring_enter (2)
can also wait for a specified number of requests to be processed by the kernel
before it returns.
If you specified a number of completions to wait for,
the kernel will have placed at least that many CQEs on the CQ,
which you can then readily read,
right after the return from
.BR io_uring_enter (2).
.IP \(bu
It is important to remember that I/O requests submitted to the kernel can
complete in any order.
It is not necessary for the kernel to process one request after another,
in the order you placed them.
Given that the interface is a ring,
the requests are attempted in order,
however that doesn't imply any sort of ordering on their completion.
When more than one request is in flight,
it is not possible to determine which one will complete first.
When you dequeue CQEs off the CQ,
you should always check which submitted request it corresponds to.
The most common method for doing so is using the
.I user_data
field in the request,
which is passed back unchanged on the completion side,
as sketched below.
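.PP
For instance,
you can store a pointer to per-request context in
.I user_data
at submission time and recover it at completion time.
A minimal sketch (the struct request type here is hypothetical):
.PP
.in +4n
.EX
struct request { int fd; char *buf; };
struct request *req = malloc(sizeof(*req));
/* ... fill in req, acquire and describe the SQE ... */
sqe->user_data = (__u64) (unsigned long) req;
/* later, on the completion side */
struct request *done = (struct request *) (unsigned long) cqe->user_data;
.EE
.in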
.PP
Adding to and reading from the queues:
.IP \(bu
You add SQEs to the tail of the SQ.
The kernel reads SQEs off the head of the queue.
.IP \(bu
The kernel adds CQEs to the tail of the CQ.
You read CQEs off the head of the queue.
.PP
It should be noted that,
depending on the configuration,
io_uring's behavior can deviate from the model outlined above:
no CQE is posted for a successful SQE that has
.B IOSQE_CQE_SKIP_SUCCESS
set,
multiple CQEs may be posted for a single SQE in the case of multi-shot
operations,
and no
.BR io_uring_enter (2)
syscall may be needed to make the kernel begin processing newly added
SQEs when submission queue polling is in use.
.SS Submission queue polling
One of the goals of
.B io_uring
is to provide a means for efficient I/O.
To this end,
.B io_uring
supports a polling mode that lets you avoid the call to
.BR io_uring_enter (2),
which you use to inform the kernel that you have queued SQEs on to the SQ.
With SQ Polling,
.B io_uring
starts a kernel thread that polls the submission queue for any I/O
requests you submit by adding SQEs.
With SQ Polling enabled,
there is no need for you to call
.BR io_uring_enter (2),
letting you avoid the overhead of system calls.
A designated kernel thread dequeues SQEs off the SQ as you add them and
dispatches them for asynchronous processing.
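.PP
A minimal sketch of requesting submission queue polling at setup time,
using the
.BR io_uring_setup ()
wrapper from the example program below (the idle time value is arbitrary):
.PP
.in +4n
.EX
struct io_uring_params p;
memset(&p, 0, sizeof(p));
p.flags = IORING_SETUP_SQPOLL;
p.sq_thread_idle = 2000;    /* poll thread sleeps after 2 s of idleness */
int ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
/* If the poll thread has gone to sleep, the SQ ring flags field will
 * have IORING_SQ_NEED_WAKEUP set, and the thread must be woken up with:
 * io_uring_enter(ring_fd, 0, 0, IORING_ENTER_SQ_WAKEUP); */
.EE
.in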
.SS Setting up io_uring
.PP
The main steps in setting up
.B io_uring
consist of mapping in the shared buffers with
.BR mmap (2)
calls.
In the example program included in this man page,
the function
.BR app_setup_uring ()
sets up
.B io_uring
with a QUEUE_DEPTH deep submission queue.
Pay attention to the two
.BR mmap (2)
calls that set up the shared submission and completion queues.
If your kernel is older than version 5.4,
three
.BR mmap (2)
calls are required.
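.PP
In outline,
the ring regions are mapped from the ring file descriptor at fixed offsets
defined in
.IR linux/io_uring.h ,
as the example program does:
.PP
.in +4n
.EX
/* SQ and CQ ring headers (one mmap() covers both on kernels >= 5.4) */
sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
              MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
/* the array of SQEs is always a separate mapping */
sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
            PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
            ring_fd, IORING_OFF_SQES);
.EE
.in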
.PP
.SS Submitting I/O requests
The process of submitting a request consists of describing the I/O
operation you need to get done using an
.B io_uring_sqe
structure instance.
These details describe the equivalent system call and its parameters.
Because the range of I/O operations Linux supports is very varied and the
.B io_uring_sqe
structure needs to be able to describe them,
it has several fields,
some packed into unions for space efficiency.
Here is a simplified version of struct
.B io_uring_sqe
with some of the most often used fields:
.PP
.in +4n
.EX
struct io_uring_sqe {
    __u8    opcode;     /* type of operation for this sqe */
    __s32   fd;         /* file descriptor to do IO on */
    __u64   off;        /* offset into file */
    __u64   addr;       /* pointer to buffer or iovecs */
    __u32   len;        /* buffer size or number of iovecs */
    __u64   user_data;  /* data to be passed back at completion time */
    __u8    flags;      /* IOSQE_ flags */
    ...
};
.EE
.in
Here is struct
.B io_uring_sqe
in full:
.in +4n
.EX
struct io_uring_sqe {
    __u8    opcode;     /* type of operation for this sqe */
    __u8    flags;      /* IOSQE_ flags */
    __u16   ioprio;     /* ioprio for the request */
    __s32   fd;         /* file descriptor to do IO on */
    union {
        __u64   off;    /* offset into file */
        __u64   addr2;
        struct {
            __u32   cmd_op;
            __u32   __pad1;
        };
    };
    union {
        __u64   addr;   /* pointer to buffer or iovecs */
        __u64   splice_off_in;
        struct {
            __u32   level;
            __u32   optname;
        };
    };
    __u32   len;        /* buffer size or number of iovecs */
    union {
        __kernel_rwf_t  rw_flags;
        __u32   fsync_flags;
        __u16   poll_events;    /* compatibility */
        __u32   poll32_events;  /* word-reversed for BE */
        __u32   sync_range_flags;
        __u32   msg_flags;
        __u32   timeout_flags;
        __u32   accept_flags;
        __u32   cancel_flags;
        __u32   open_flags;
        __u32   statx_flags;
        __u32   fadvise_advice;
        __u32   splice_flags;
        __u32   rename_flags;
        __u32   unlink_flags;
        __u32   hardlink_flags;
        __u32   xattr_flags;
        __u32   msg_ring_flags;
        __u32   uring_cmd_flags;
        __u32   waitid_flags;
        __u32   futex_flags;
        __u32   install_fd_flags;
        __u32   nop_flags;
    };
    __u64   user_data;  /* data to be passed back at completion time */
    /* pack this to avoid bogus arm OABI complaints */
    union {
        /* index into fixed buffers, if used */
        __u16   buf_index;
        /* for grouped buffer selection */
        __u16   buf_group;
    } __attribute__((packed));
    /* personality to use, if used */
    __u16   personality;
    union {
        __s32   splice_fd_in;
        __u32   file_index;
        __u32   optlen;
        struct {
            __u16   addr_len;
            __u16   __pad3[1];
        };
    };
    union {
        struct {
            __u64   addr3;
            __u64   __pad2[1];
        };
        __u64   optval;
        /*
         * If the ring is initialized with IORING_SETUP_SQE128, then
         * this field is used for 80 bytes of arbitrary command data
         */
        __u8    cmd[0];
    };
};
.EE
.in
.PP
To submit an I/O request to
.BR io_uring ,
you need to acquire a submission queue entry (SQE) from the submission
queue (SQ),
fill it up with details of the operation you want to submit and call
.BR io_uring_enter (2).
The liburing library provides helper functions of the form
.BR io_uring_prep_X ()
to set up the SQE correctly. If you want to avoid calling
.BR io_uring_enter (2),
you have the option of setting up Submission Queue Polling.
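.PP
For instance,
with liburing a read request might be prepared and submitted as follows
(a sketch,
assuming an already initialized struct io_uring named ring):
.PP
.in +4n
.EX
struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
io_uring_sqe_set_data(sqe, req);    /* sets sqe->user_data */
io_uring_submit(&ring);             /* calls io_uring_enter(2) */
.EE
.in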
.PP
SQEs are added to the tail of the submission queue.
The kernel picks up SQEs off the head of the SQ.
The general algorithm to get the next available SQE and update the tail is
as follows.
.PP
.in +4n
.EX
struct io_uring_sqe *sqe;
unsigned tail, index;
tail = *sqring->tail;
index = tail & (*sqring->ring_mask);
sqe = &sqring->sqes[index];
/* fill up details about this I/O request */
describe_io(sqe);
/* fill the sqe index into the SQ ring array */
sqring->array[index] = index;
tail++;
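/* publish the new tail so the kernel can see the filled-in SQE */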
atomic_store_explicit(sqring->tail, tail, memory_order_release);
.EE
.in
.PP
To get the index of an entry,
the application must mask the current tail index with the size mask of the
ring.
This holds true for both SQs and CQs.
Once the SQE is acquired,
the necessary fields are filled in,
describing the request.
While the CQ ring directly indexes the shared array of CQEs,
the submission side has an indirection array between them.
The submission side ring buffer is an index into this array,
which in turn contains the index into the SQEs.
.PP
The following code snippet demonstrates how a read operation,
the equivalent of a
.BR preadv2 (2)
system call,
is described by filling up an SQE with the necessary parameters.
.PP
.in +4n
.EX
struct iovec iovecs[16];
...
sqe->opcode = IORING_OP_READV;
sqe->fd = fd;
sqe->addr = (unsigned long) iovecs;
sqe->len = 16;
sqe->off = offset;
sqe->flags = 0;
.EE
.in
.TP
.B Memory ordering
Modern compilers and CPUs freely reorder reads and writes without
affecting the program's outcome to optimize performance.
Some aspects of this need to be kept in mind on SMP systems since
.B io_uring
involves buffers shared between kernel and user space.
These buffers are both visible and modifiable from kernel and user space.
As heads and tails belonging to these shared buffers are updated by kernel
and user space,
changes need to be visible in a coherent manner on either side,
irrespective of which CPU the kernel or the application happens to be
running on.
Memory barriers are used to enforce this coherency.
Being a significant subject of their own,
memory barriers are beyond the scope of this man page.
For more information on memory models,
refer to Documentation/memory-barriers.txt in the kernel source tree,
or to the documentation of the formal C11 or kernel memory models.
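.PP
In C11 terms,
the producer publishes its new tail with a store-release and the consumer
reads the opposite index with a load-acquire.
A sketch using the ring pointers from the example program below
(which wraps these calls in macros):
.PP
.in +4n
.EX
/* publish the new SQ tail: all prior writes to the SQE become
 * visible to the kernel before the new tail value does */
atomic_store_explicit((_Atomic unsigned *) sring_tail, tail,
                      memory_order_release);
/* read the CQ tail: the kernel's writes to the CQE are visible
 * once the new tail value is observed */
unsigned cq_tail = atomic_load_explicit((_Atomic unsigned *) cring_tail,
                                        memory_order_acquire);
.EE
.in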
.TP
.B Letting the kernel know about I/O submissions
Once you place one or more SQEs on to the SQ,
you need to let the kernel know that you've done so.
You can do this by calling the
.BR io_uring_enter (2)
system call.
This system call is also capable of waiting for a specified count of
events to complete.
This way,
you can be sure to find completion events in the completion queue without
having to poll it for events later.
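.PP
For example,
to submit one SQE and wait for its completion in the same call,
as the example program below does:
.PP
.in +4n
.EX
/* submit 1 SQE, wait for at least 1 completion event */
int ret = io_uring_enter(ring_fd, 1, 1, IORING_ENTER_GETEVENTS);
.EE
.in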
.SS Reading completion events
Similar to the submission queue (SQ),
the completion queue (CQ) is a shared buffer between the kernel and user
space.
Whereas you placed submission queue entries on the tail of the SQ and the
kernel read off the head,
when it comes to the CQ,
the kernel places completion queue events or CQEs on the tail of the CQ and
you read off its head.
.PP
Submission is flexible (and thus a bit more complicated) since it needs to
be able to encode different types of system calls that take various
parameters.
Completion,
on the other hand,
is simpler since we're looking only for a return value back from the
kernel.
This is easily understood by looking at the completion queue event
structure,
struct
.BR io_uring_cqe :
.PP
.in +4n
.EX
struct io_uring_cqe {
    __u64   user_data;  /* sqe->data submission passed back */
    __s32   res;        /* result code for this event */
    __u32   flags;
};
.EE
.in
.PP
Here,
.I user_data
is custom data that is passed unchanged from submission to completion.
That is,
from SQEs to CQEs.
This field can be used to set context,
uniquely identifying submissions that got completed.
Given that I/O requests can complete in any order,
this field can be used to correlate a submission with a completion.
.I res
is the result from the system call that was performed as part of the
submission;
its return value.
The
.I flags
field carries request-specific information. As of the 6.0 kernel, the following
flags are defined:
.TP
.B IORING_CQE_F_BUFFER
If set, the upper 16 bits of the flags field carry the buffer ID that was
chosen for this request (see the sketch after this list). The request must
have been issued with
.B IOSQE_BUFFER_SELECT
set, and used with a request type that supports buffer selection. Additionally,
buffers must have been provided upfront either via the
.B IORING_OP_PROVIDE_BUFFERS
or the
.B IORING_REGISTER_PBUF_RING
methods.
.TP
.B IORING_CQE_F_MORE
If set, the application should expect more completions from the request. This
is used for requests that can generate multiple completions, such as
multi-shot receive or accept requests.
.TP
.B IORING_CQE_F_SOCK_NONEMPTY
If set, the socket still had data left to read when this receive request
completed.
.TP
.B IORING_CQE_F_NOTIF
Set for notification CQEs, as seen with the zero-copy networking send and
receive support.
.TP
.B IORING_CQE_F_BUF_MORE
If set, the buffer indicated by the buffer ID in this completion will be used
for more completions. This means that the provided buffer has been partially
consumed and there's more buffer space left, and hence the application should
expect more completions with this buffer ID. Each completion will continue
where the previous one left off. This can only happen if the provided buffer
ring has been set up with
.B IORING_REGISTER_PBUF_RING
to allow for incremental / partial consumption of buffers.
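.PP
For instance,
the buffer ID accompanying
.B IORING_CQE_F_BUFFER
can be recovered by shifting the flags field:
.PP
.in +4n
.EX
if (cqe->flags & IORING_CQE_F_BUFFER) {
    unsigned buf_id = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
    /* look up the provided buffer belonging to buf_id */
}
.EE
.in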
.PP
The general sequence to read completion events off the completion queue is
as follows:
.PP
.in +4n
.EX
unsigned head;
head = *cqring->head;
if (head != atomic_load_explicit(cqring->tail, memory_order_acquire)) {
    struct io_uring_cqe *cqe;
    unsigned index;
    index = head & (*cqring->mask);
    cqe = &cqring->cqes[index];
    /* process completed CQE */
    process_cqe(cqe);
    /* CQE consumption complete */
    head++;
}
/* publish the new head so the kernel can reuse the CQE slot */
atomic_store_explicit(cqring->head, head, memory_order_release);
.EE
.in
.PP
It helps to be reminded that the kernel adds CQEs to the tail of the CQ,
while you need to dequeue them off the head.
To get the index of an entry at the head,
the application must mask the current head index with the size mask of the
ring.
Once the CQE has been consumed or processed,
the head needs to be updated to reflect the consumption of the CQE.
Attention should be paid to the read and write barriers to ensure
successful read and update of the head.
.SS io_uring performance
Because of the shared ring buffers between kernel and user space,
.B io_uring
can be a zero-copy system.
Copying buffers becomes necessary when system calls that transfer data
between kernel and user space are involved.
But since the bulk of the communication in
.B io_uring
is via buffers shared between the kernel and user space,
this copying overhead is avoided.
.PP
While system calls may not seem like a significant overhead,
in high performance applications,
making a lot of them will begin to matter.
Mitigations the operating system has in place for hardware vulnerabilities
such as Spectre and Meltdown add overhead at the system call boundary,
making system calls more expensive than before on affected hardware.
While newer hardware should not need these mitigations,
hardware with these vulnerabilities can be expected to be in the wild for a
long time.
With synchronous programming interfaces,
and even with other asynchronous programming interfaces under Linux,
there is at least one system call involved in the submission of each
request.
In
.BR io_uring ,
on the other hand,
you can batch several requests in one go,
simply by queueing up multiple SQEs,
each describing an I/O operation you want,
and making a single call to
.BR io_uring_enter (2).
This is possible due to
.BR io_uring 's
shared buffers based design.
.PP
While this batching in itself can avoid the overhead associated with
potentially multiple and frequent system calls,
you can reduce even this overhead further with Submission Queue Polling,
by having the kernel poll and pick up your SQEs for processing as you add
them to the submission queue. This avoids the
.BR io_uring_enter (2)
call you need to make to tell the kernel to pick SQEs up.
For high-performance applications,
this means even fewer system call overheads.
.SH CONFORMING TO
.B io_uring
is Linux-specific.
.SH EXAMPLES
The following example uses
.B io_uring
to copy stdin to stdout.
Using shell redirection,
you should be able to copy files with this example.
Because it uses a queue depth of only one,
this example processes I/O requests one after the other.
It is purposefully kept this way to aid understanding.
In real-world scenarios, however,
you'll want to have a larger queue depth to parallelize I/O request
processing so as to gain the kind of performance benefits
.B io_uring
provides with its asynchronous processing of requests.
.PP
.EX
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdatomic.h>
#include <linux/io_uring.h>
#define QUEUE_DEPTH 1
#define BLOCK_SZ 1024
/* Macros for barriers needed by io_uring */
#define io_uring_smp_store_release(p, v)            \\
    atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \\
                          memory_order_release)
#define io_uring_smp_load_acquire(p)                \\
    atomic_load_explicit((_Atomic typeof(*(p)) *)(p),       \\
                         memory_order_acquire)
int ring_fd;
unsigned *sring_tail, *sring_mask, *sring_array,
*cring_head, *cring_tail, *cring_mask;
struct io_uring_sqe *sqes;
struct io_uring_cqe *cqes;
char buff[BLOCK_SZ];
off_t offset;
/*
* System call wrappers provided since glibc does not yet
* provide wrappers for io_uring system calls.
* */
int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    return (int) syscall(__NR_io_uring_setup, entries, p);
}
int io_uring_enter(int ring_fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags)
{
    return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit,
                         min_complete, flags, NULL, 0);
}
int app_setup_uring(void) {
    struct io_uring_params p;
    void *sq_ptr, *cq_ptr;
    /* See io_uring_setup(2) for io_uring_params.flags you can set */
    memset(&p, 0, sizeof(p));
    ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
    if (ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }
    /*
     * io_uring communication happens via 2 shared kernel-user space ring
     * buffers, which can be jointly mapped with a single mmap() call in
     * kernels >= 5.4.
     */
    int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
    /* Rather than check for kernel version, the recommended way is to
     * check the features field of the io_uring_params structure, which is a
     * bitmask. If IORING_FEAT_SINGLE_MMAP is set, we can do away with the
     * second mmap() call to map in the completion ring separately.
     */
    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        if (cring_sz > sring_sz)
            sring_sz = cring_sz;
        cring_sz = sring_sz;
    }
    /* Map in the submission and completion queue ring buffers.
     * Kernels < 5.4 only map in the submission queue, though.
     */
    sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_POPULATE,
                  ring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        /* Map in the completion queue ring buffer in older kernels separately */
        cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE,
                      ring_fd, IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
    }
    /* Save useful fields for later easy reference */
    sring_tail = sq_ptr + p.sq_off.tail;
    sring_mask = sq_ptr + p.sq_off.ring_mask;
    sring_array = sq_ptr + p.sq_off.array;
    /* Map in the submission queue entries array */
    sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                ring_fd, IORING_OFF_SQES);
    if (sqes == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    /* Save useful fields for later easy reference */
    cring_head = cq_ptr + p.cq_off.head;
    cring_tail = cq_ptr + p.cq_off.tail;
    cring_mask = cq_ptr + p.cq_off.ring_mask;
    cqes = cq_ptr + p.cq_off.cqes;
    return 0;
}
/*
 * Read from completion queue.
 * In this function, we read completion events from the completion queue.
 * We dequeue the CQE, update the head and return the result of the operation.
 * */
int read_from_cq() {
    struct io_uring_cqe *cqe;
    unsigned head;
    int res;
    /* Read barrier */
    head = io_uring_smp_load_acquire(cring_head);
    /*
     * Remember, this is a ring buffer. If head == tail, it means that the
     * buffer is empty.
     * */
    if (head == *cring_tail)
        return -1;
    /* Get the entry */
    cqe = &cqes[head & (*cring_mask)];
    /* Save the result before publishing the new head; once the head
     * moves past this slot, the kernel is free to reuse it. */
    res = cqe->res;
    if (res < 0)
        fprintf(stderr, "Error: %s\\n", strerror(abs(res)));
    head++;
    /* Write barrier so that the update to the head is made visible */
    io_uring_smp_store_release(cring_head, head);
    return res;
}
/*
 * Submit a read or a write request to the submission queue.
 * len is the number of bytes to read or write.
 * */
int submit_to_sq(int fd, int op, unsigned len) {
    unsigned index, tail;
    /* Add our submission queue entry to the tail of the SQE ring buffer */
    tail = *sring_tail;
    index = tail & *sring_mask;
    struct io_uring_sqe *sqe = &sqes[index];
    /* Fill in the parameters required for the read or write operation */
    sqe->opcode = op;
    sqe->fd = fd;
    sqe->addr = (unsigned long) buff;
    sqe->len = len;
    sqe->off = offset;
    sring_array[index] = index;
    tail++;
    /* Update the tail */
    io_uring_smp_store_release(sring_tail, tail);
    /*
     * Tell the kernel we have submitted events with the io_uring_enter()
     * system call. We also pass in the IORING_ENTER_GETEVENTS flag which
     * causes the io_uring_enter() call to wait until min_complete
     * (the 3rd param) events complete.
     * */
    int ret = io_uring_enter(ring_fd, 1, 1,
                             IORING_ENTER_GETEVENTS);
    if (ret < 0) {
        perror("io_uring_enter");
        return -1;
    }
    return ret;
}
int main(int argc, char *argv[]) {
    int res;
    /* Setup io_uring for use */
    if (app_setup_uring()) {
        fprintf(stderr, "Unable to setup uring!\\n");
        return 1;
    }
    /*
     * A while loop that reads from stdin and writes to stdout.
     * Breaks on EOF.
     */
    while (1) {
        /* Initiate read from stdin and wait for it to complete */
        submit_to_sq(STDIN_FILENO, IORING_OP_READ, BLOCK_SZ);
        /* Read completion queue entry */
        res = read_from_cq();
        if (res > 0) {
            /* Read successful. Write the bytes just read to stdout. */
            submit_to_sq(STDOUT_FILENO, IORING_OP_WRITE, res);
            read_from_cq();
        } else if (res == 0) {
            /* reached EOF */
            break;
        } else {
            /* Error reading file */
            fprintf(stderr, "Error: %s\\n", strerror(abs(res)));
            break;
        }
        offset += res;
    }
    return 0;
}
.EE
.SH SEE ALSO
.BR io_uring_enter (2),
.BR io_uring_register (2),
.BR io_uring_setup (2)