Commit | Line | Data |
---|---|---|
52885fa2 | 1 | /* |
bffad86f | 2 | * io_uring engine |
52885fa2 | 3 | * |
bffad86f | 4 | * IO engine using the new native Linux aio io_uring interface. See: |
a90cd050 | 5 | * |
bffad86f | 6 | * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring |
52885fa2 JA |
7 | * |
8 | */ | |
9 | #include <stdlib.h> | |
10 | #include <unistd.h> | |
11 | #include <errno.h> | |
52885fa2 JA |
12 | #include <sys/time.h> |
13 | #include <sys/resource.h> | |
14 | ||
15 | #include "../fio.h" | |
16 | #include "../lib/pow2.h" | |
17 | #include "../optgroup.h" | |
18 | #include "../lib/memalign.h" | |
b87aa01a | 19 | #include "../lib/fls.h" |
6d975f2c | 20 | #include "../lib/roundup.h" |
52885fa2 | 21 | |
bffad86f | 22 | #ifdef ARCH_HAVE_IOURING |
52885fa2 | 23 | |
57fa61f0 | 24 | #include "../lib/types.h" |
f3e769a4 | 25 | #include "../os/linux/io_uring.h" |
e9f6567a | 26 | #include "cmdprio.h" |
9a2d78b3 | 27 | |
bffad86f | 28 | struct io_sq_ring { |
e2239016 JA |
29 | unsigned *head; |
30 | unsigned *tail; | |
31 | unsigned *ring_mask; | |
32 | unsigned *ring_entries; | |
33 | unsigned *flags; | |
34 | unsigned *array; | |
52885fa2 JA |
35 | }; |
36 | ||
bffad86f | 37 | struct io_cq_ring { |
e2239016 JA |
38 | unsigned *head; |
39 | unsigned *tail; | |
40 | unsigned *ring_mask; | |
41 | unsigned *ring_entries; | |
f0403f94 | 42 | struct io_uring_cqe *cqes; |
9a2d78b3 JA |
43 | }; |
44 | ||
bffad86f | 45 | struct ioring_mmap { |
9a2d78b3 JA |
46 | void *ptr; |
47 | size_t len; | |
52885fa2 JA |
48 | }; |
49 | ||
bffad86f | 50 | struct ioring_data { |
9a2d78b3 JA |
51 | int ring_fd; |
52 | ||
52885fa2 JA |
53 | struct io_u **io_u_index; |
54 | ||
5ffd5626 JA |
55 | int *fds; |
56 | ||
bffad86f | 57 | struct io_sq_ring sq_ring; |
f0403f94 | 58 | struct io_uring_sqe *sqes; |
9a2d78b3 | 59 | struct iovec *iovecs; |
b87aa01a | 60 | unsigned sq_ring_mask; |
52885fa2 | 61 | |
bffad86f | 62 | struct io_cq_ring cq_ring; |
b87aa01a | 63 | unsigned cq_ring_mask; |
52885fa2 JA |
64 | |
65 | int queued; | |
66 | int cq_ring_off; | |
b87aa01a | 67 | unsigned iodepth; |
5a59a81d | 68 | int prepped; |
96563db9 | 69 | |
bffad86f | 70 | struct ioring_mmap mmap[3]; |
e9f6567a DLM |
71 | |
72 | bool use_cmdprio; | |
52885fa2 JA |
73 | }; |
74 | ||
bffad86f | 75 | struct ioring_options { |
a48f0cc7 | 76 | struct thread_data *td; |
52885fa2 | 77 | unsigned int hipri; |
e9f6567a | 78 | struct cmdprio cmdprio; |
52885fa2 | 79 | unsigned int fixedbufs; |
5ffd5626 | 80 | unsigned int registerfiles; |
3d7d00a3 | 81 | unsigned int sqpoll_thread; |
2ea53ca3 JA |
82 | unsigned int sqpoll_set; |
83 | unsigned int sqpoll_cpu; | |
b10b1e70 | 84 | unsigned int nonvectored; |
4a87b584 | 85 | unsigned int uncached; |
7d42e66e | 86 | unsigned int nowait; |
5a59a81d | 87 | unsigned int force_async; |
52885fa2 JA |
88 | }; |
89 | ||
b10b1e70 JA |
90 | static const int ddir_to_op[2][2] = { |
91 | { IORING_OP_READV, IORING_OP_READ }, | |
92 | { IORING_OP_WRITEV, IORING_OP_WRITE } | |
93 | }; | |
94 | ||
3f1e3af7 KB |
95 | static const int fixed_ddir_to_op[2] = { |
96 | IORING_OP_READ_FIXED, | |
97 | IORING_OP_WRITE_FIXED | |
98 | }; | |
99 | ||
2ea53ca3 | 100 | static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val) |
a90cd050 | 101 | { |
bffad86f | 102 | struct ioring_options *o = data; |
a90cd050 | 103 | |
2ea53ca3 JA |
104 | o->sqpoll_cpu = *val; |
105 | o->sqpoll_set = 1; | |
a90cd050 JA |
106 | return 0; |
107 | } | |
108 | ||
a48f0cc7 DLM |
109 | static int str_cmdprio_bssplit_cb(void *data, const char *input) |
110 | { | |
111 | struct ioring_options *o = data; | |
112 | struct thread_data *td = o->td; | |
113 | struct cmdprio *cmdprio = &o->cmdprio; | |
114 | ||
115 | return fio_cmdprio_bssplit_parse(td, input, cmdprio); | |
116 | } | |
117 | ||
52885fa2 JA |
118 | static struct fio_option options[] = { |
119 | { | |
120 | .name = "hipri", | |
121 | .lname = "High Priority", | |
122 | .type = FIO_OPT_STR_SET, | |
bffad86f | 123 | .off1 = offsetof(struct ioring_options, hipri), |
52885fa2 JA |
124 | .help = "Use polled IO completions", |
125 | .category = FIO_OPT_C_ENGINE, | |
27f436d9 | 126 | .group = FIO_OPT_G_IOURING, |
52885fa2 | 127 | }, |
b2a432bf PC |
128 | #ifdef FIO_HAVE_IOPRIO_CLASS |
129 | { | |
130 | .name = "cmdprio_percentage", | |
131 | .lname = "high priority percentage", | |
132 | .type = FIO_OPT_INT, | |
e9f6567a DLM |
133 | .off1 = offsetof(struct ioring_options, |
134 | cmdprio.percentage[DDIR_READ]), | |
135 | .off2 = offsetof(struct ioring_options, | |
136 | cmdprio.percentage[DDIR_WRITE]), | |
137 | .minval = 0, | |
b2a432bf PC |
138 | .maxval = 100, |
139 | .help = "Send high priority I/O this percentage of the time", | |
140 | .category = FIO_OPT_C_ENGINE, | |
141 | .group = FIO_OPT_G_IOURING, | |
142 | }, | |
12f9d54a DLM |
143 | { |
144 | .name = "cmdprio_class", | |
145 | .lname = "Asynchronous I/O priority class", | |
146 | .type = FIO_OPT_INT, | |
147 | .off1 = offsetof(struct ioring_options, | |
148 | cmdprio.class[DDIR_READ]), | |
149 | .off2 = offsetof(struct ioring_options, | |
150 | cmdprio.class[DDIR_WRITE]), | |
151 | .help = "Set asynchronous IO priority class", | |
152 | .minval = IOPRIO_MIN_PRIO_CLASS + 1, | |
153 | .maxval = IOPRIO_MAX_PRIO_CLASS, | |
154 | .interval = 1, | |
155 | .category = FIO_OPT_C_ENGINE, | |
156 | .group = FIO_OPT_G_IOURING, | |
157 | }, | |
158 | { | |
159 | .name = "cmdprio", | |
160 | .lname = "Asynchronous I/O priority level", | |
161 | .type = FIO_OPT_INT, | |
162 | .off1 = offsetof(struct ioring_options, | |
163 | cmdprio.level[DDIR_READ]), | |
164 | .off2 = offsetof(struct ioring_options, | |
165 | cmdprio.level[DDIR_WRITE]), | |
166 | .help = "Set asynchronous IO priority level", | |
167 | .minval = IOPRIO_MIN_PRIO, | |
168 | .maxval = IOPRIO_MAX_PRIO, | |
169 | .interval = 1, | |
170 | .category = FIO_OPT_C_ENGINE, | |
171 | .group = FIO_OPT_G_IOURING, | |
172 | }, | |
a48f0cc7 DLM |
173 | { |
174 | .name = "cmdprio_bssplit", | |
175 | .lname = "Priority percentage block size split", | |
176 | .type = FIO_OPT_STR_ULL, | |
177 | .cb = str_cmdprio_bssplit_cb, | |
178 | .off1 = offsetof(struct ioring_options, cmdprio.bssplit), | |
179 | .help = "Set priority percentages for different block sizes", | |
180 | .category = FIO_OPT_C_ENGINE, | |
181 | .group = FIO_OPT_G_IOURING, | |
182 | }, | |
b2a432bf PC |
183 | #else |
184 | { | |
185 | .name = "cmdprio_percentage", | |
186 | .lname = "high priority percentage", | |
187 | .type = FIO_OPT_UNSUPPORTED, | |
188 | .help = "Your platform does not support I/O priority classes", | |
189 | }, | |
12f9d54a DLM |
190 | { |
191 | .name = "cmdprio_class", | |
192 | .lname = "Asynchronous I/O priority class", | |
193 | .type = FIO_OPT_UNSUPPORTED, | |
194 | .help = "Your platform does not support I/O priority classes", | |
195 | }, | |
196 | { | |
197 | .name = "cmdprio", | |
198 | .lname = "Asynchronous I/O priority level", | |
199 | .type = FIO_OPT_UNSUPPORTED, | |
200 | .help = "Your platform does not support I/O priority classes", | |
201 | }, | |
a48f0cc7 DLM |
202 | { |
203 | .name = "cmdprio_bssplit", | |
204 | .lname = "Priority percentage block size split", | |
205 | .type = FIO_OPT_UNSUPPORTED, | |
206 | .help = "Your platform does not support I/O priority classes", | |
207 | }, | |
b2a432bf | 208 | #endif |
52885fa2 JA |
209 | { |
210 | .name = "fixedbufs", | |
211 | .lname = "Fixed (pre-mapped) IO buffers", | |
212 | .type = FIO_OPT_STR_SET, | |
bffad86f | 213 | .off1 = offsetof(struct ioring_options, fixedbufs), |
52885fa2 JA |
214 | .help = "Pre map IO buffers", |
215 | .category = FIO_OPT_C_ENGINE, | |
27f436d9 | 216 | .group = FIO_OPT_G_IOURING, |
52885fa2 | 217 | }, |
5ffd5626 JA |
218 | { |
219 | .name = "registerfiles", | |
220 | .lname = "Register file set", | |
221 | .type = FIO_OPT_STR_SET, | |
222 | .off1 = offsetof(struct ioring_options, registerfiles), | |
223 | .help = "Pre-open/register files", | |
224 | .category = FIO_OPT_C_ENGINE, | |
27f436d9 | 225 | .group = FIO_OPT_G_IOURING, |
5ffd5626 | 226 | }, |
771c9901 JA |
227 | { |
228 | .name = "sqthread_poll", | |
3d7d00a3 JA |
229 | .lname = "Kernel SQ thread polling", |
230 | .type = FIO_OPT_INT, | |
231 | .off1 = offsetof(struct ioring_options, sqpoll_thread), | |
232 | .help = "Offload submission/completion to kernel thread", | |
233 | .category = FIO_OPT_C_ENGINE, | |
27f436d9 | 234 | .group = FIO_OPT_G_IOURING, |
3d7d00a3 JA |
235 | }, |
236 | { | |
237 | .name = "sqthread_poll_cpu", | |
238 | .lname = "SQ Thread Poll CPU", | |
2ea53ca3 JA |
239 | .type = FIO_OPT_INT, |
240 | .cb = fio_ioring_sqpoll_cb, | |
3d7d00a3 | 241 | .help = "What CPU to run SQ thread polling on", |
a90cd050 | 242 | .category = FIO_OPT_C_ENGINE, |
27f436d9 | 243 | .group = FIO_OPT_G_IOURING, |
a90cd050 | 244 | }, |
b10b1e70 JA |
245 | { |
246 | .name = "nonvectored", | |
247 | .lname = "Non-vectored", | |
248 | .type = FIO_OPT_INT, | |
249 | .off1 = offsetof(struct ioring_options, nonvectored), | |
556d8415 | 250 | .def = "-1", |
b10b1e70 JA |
251 | .help = "Use non-vectored read/write commands", |
252 | .category = FIO_OPT_C_ENGINE, | |
253 | .group = FIO_OPT_G_IOURING, | |
254 | }, | |
4a87b584 JA |
255 | { |
256 | .name = "uncached", | |
257 | .lname = "Uncached", | |
258 | .type = FIO_OPT_INT, | |
259 | .off1 = offsetof(struct ioring_options, uncached), | |
260 | .help = "Use RWF_UNCACHED for buffered read/writes", | |
261 | .category = FIO_OPT_C_ENGINE, | |
262 | .group = FIO_OPT_G_IOURING, | |
263 | }, | |
7d42e66e KK |
264 | { |
265 | .name = "nowait", | |
266 | .lname = "RWF_NOWAIT", | |
267 | .type = FIO_OPT_BOOL, | |
268 | .off1 = offsetof(struct ioring_options, nowait), | |
269 | .help = "Use RWF_NOWAIT for reads/writes", | |
270 | .category = FIO_OPT_C_ENGINE, | |
271 | .group = FIO_OPT_G_IOURING, | |
272 | }, | |
5a59a81d JA |
273 | { |
274 | .name = "force_async", | |
275 | .lname = "Force async", | |
276 | .type = FIO_OPT_INT, | |
277 | .off1 = offsetof(struct ioring_options, force_async), | |
278 | .help = "Set IOSQE_ASYNC every N requests", | |
279 | .category = FIO_OPT_C_ENGINE, | |
280 | .group = FIO_OPT_G_IOURING, | |
281 | }, | |
52885fa2 JA |
282 | { |
283 | .name = NULL, | |
284 | }, | |
285 | }; | |
286 | ||
bffad86f | 287 | static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit, |
52885fa2 JA |
288 | unsigned int min_complete, unsigned int flags) |
289 | { | |
bfed648c | 290 | return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit, |
521164fa | 291 | min_complete, flags, NULL, 0); |
52885fa2 JA |
292 | } |
293 | ||
bffad86f | 294 | static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) |
52885fa2 | 295 | { |
bffad86f | 296 | struct ioring_data *ld = td->io_ops_data; |
cfcc8564 | 297 | struct ioring_options *o = td->eo; |
52885fa2 | 298 | struct fio_file *f = io_u->file; |
f0403f94 | 299 | struct io_uring_sqe *sqe; |
52885fa2 | 300 | |
f0403f94 | 301 | sqe = &ld->sqes[io_u->index]; |
34d6090e | 302 | |
5ffd5626 JA |
303 | if (o->registerfiles) { |
304 | sqe->fd = f->engine_pos; | |
305 | sqe->flags = IOSQE_FIXED_FILE; | |
306 | } else { | |
307 | sqe->fd = f->fd; | |
87b69ef2 | 308 | sqe->flags = 0; |
5ffd5626 | 309 | } |
52885fa2 | 310 | |
e3970057 | 311 | if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) { |
f0403f94 | 312 | if (o->fixedbufs) { |
3f1e3af7 | 313 | sqe->opcode = fixed_ddir_to_op[io_u->ddir]; |
919850d2 | 314 | sqe->addr = (unsigned long) io_u->xfer_buf; |
f0403f94 | 315 | sqe->len = io_u->xfer_buflen; |
2ea53ca3 | 316 | sqe->buf_index = io_u->index; |
cfcc8564 | 317 | } else { |
832faaaf JA |
318 | struct iovec *iov = &ld->iovecs[io_u->index]; |
319 | ||
320 | /* | |
321 | * Update based on actual io_u, requeue could have | |
322 | * adjusted these | |
323 | */ | |
324 | iov->iov_base = io_u->xfer_buf; | |
325 | iov->iov_len = io_u->xfer_buflen; | |
326 | ||
3f1e3af7 | 327 | sqe->opcode = ddir_to_op[io_u->ddir][!!o->nonvectored]; |
b10b1e70 | 328 | if (o->nonvectored) { |
832faaaf JA |
329 | sqe->addr = (unsigned long) iov->iov_base; |
330 | sqe->len = iov->iov_len; | |
b10b1e70 | 331 | } else { |
832faaaf | 332 | sqe->addr = (unsigned long) iov; |
b10b1e70 JA |
333 | sqe->len = 1; |
334 | } | |
cfcc8564 | 335 | } |
fd70e361 | 336 | sqe->rw_flags = 0; |
4a87b584 | 337 | if (!td->o.odirect && o->uncached) |
fd70e361 | 338 | sqe->rw_flags |= RWF_UNCACHED; |
7d42e66e KK |
339 | if (o->nowait) |
340 | sqe->rw_flags |= RWF_NOWAIT; | |
f0403f94 | 341 | sqe->off = io_u->offset; |
48e698fa | 342 | } else if (ddir_sync(io_u->ddir)) { |
7c70f506 | 343 | sqe->ioprio = 0; |
01387bfe AF |
344 | if (io_u->ddir == DDIR_SYNC_FILE_RANGE) { |
345 | sqe->off = f->first_write; | |
346 | sqe->len = f->last_write - f->first_write; | |
347 | sqe->sync_range_flags = td->o.sync_file_range; | |
348 | sqe->opcode = IORING_OP_SYNC_FILE_RANGE; | |
349 | } else { | |
7c70f506 JA |
350 | sqe->off = 0; |
351 | sqe->addr = 0; | |
352 | sqe->len = 0; | |
01387bfe AF |
353 | if (io_u->ddir == DDIR_DATASYNC) |
354 | sqe->fsync_flags |= IORING_FSYNC_DATASYNC; | |
355 | sqe->opcode = IORING_OP_FSYNC; | |
356 | } | |
48e698fa | 357 | } |
52885fa2 | 358 | |
5a59a81d JA |
359 | if (o->force_async && ++ld->prepped == o->force_async) { |
360 | ld->prepped = 0; | |
361 | sqe->flags |= IOSQE_ASYNC; | |
362 | } | |
363 | ||
48e698fa | 364 | sqe->user_data = (unsigned long) io_u; |
52885fa2 JA |
365 | return 0; |
366 | } | |
367 | ||
bffad86f | 368 | static struct io_u *fio_ioring_event(struct thread_data *td, int event) |
52885fa2 | 369 | { |
bffad86f | 370 | struct ioring_data *ld = td->io_ops_data; |
f0403f94 | 371 | struct io_uring_cqe *cqe; |
52885fa2 | 372 | struct io_u *io_u; |
b87aa01a | 373 | unsigned index; |
52885fa2 | 374 | |
b87aa01a | 375 | index = (event + ld->cq_ring_off) & ld->cq_ring_mask; |
52885fa2 | 376 | |
f0403f94 | 377 | cqe = &ld->cq_ring.cqes[index]; |
e3466352 | 378 | io_u = (struct io_u *) (uintptr_t) cqe->user_data; |
52885fa2 | 379 | |
f0403f94 JA |
380 | if (cqe->res != io_u->xfer_buflen) { |
381 | if (cqe->res > io_u->xfer_buflen) | |
382 | io_u->error = -cqe->res; | |
52885fa2 | 383 | else |
f0403f94 | 384 | io_u->resid = io_u->xfer_buflen - cqe->res; |
52885fa2 JA |
385 | } else |
386 | io_u->error = 0; | |
387 | ||
388 | return io_u; | |
389 | } | |
390 | ||
bffad86f | 391 | static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events, |
52885fa2 JA |
392 | unsigned int max) |
393 | { | |
bffad86f JA |
394 | struct ioring_data *ld = td->io_ops_data; |
395 | struct io_cq_ring *ring = &ld->cq_ring; | |
e2239016 | 396 | unsigned head, reaped = 0; |
52885fa2 | 397 | |
9a2d78b3 | 398 | head = *ring->head; |
52885fa2 | 399 | do { |
9e26aff9 | 400 | if (head == atomic_load_acquire(ring->tail)) |
52885fa2 JA |
401 | break; |
402 | reaped++; | |
403 | head++; | |
52885fa2 JA |
404 | } while (reaped + events < max); |
405 | ||
76ce63dd AB |
406 | if (reaped) |
407 | atomic_store_release(ring->head, head); | |
408 | ||
52885fa2 JA |
409 | return reaped; |
410 | } | |
411 | ||
bffad86f JA |
412 | static int fio_ioring_getevents(struct thread_data *td, unsigned int min, |
413 | unsigned int max, const struct timespec *t) | |
52885fa2 | 414 | { |
bffad86f | 415 | struct ioring_data *ld = td->io_ops_data; |
52885fa2 | 416 | unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min; |
bffad86f JA |
417 | struct ioring_options *o = td->eo; |
418 | struct io_cq_ring *ring = &ld->cq_ring; | |
b87aa01a JA |
419 | unsigned events = 0; |
420 | int r; | |
52885fa2 | 421 | |
9a2d78b3 | 422 | ld->cq_ring_off = *ring->head; |
52885fa2 | 423 | do { |
bffad86f | 424 | r = fio_ioring_cqring_reap(td, events, max); |
52885fa2 JA |
425 | if (r) { |
426 | events += r; | |
f7cbbbf8 ST |
427 | if (actual_min != 0) |
428 | actual_min -= r; | |
52885fa2 JA |
429 | continue; |
430 | } | |
431 | ||
3d7d00a3 | 432 | if (!o->sqpoll_thread) { |
9a2d78b3 JA |
433 | r = io_uring_enter(ld, 0, actual_min, |
434 | IORING_ENTER_GETEVENTS); | |
771c9901 | 435 | if (r < 0) { |
f6abd731 | 436 | if (errno == EAGAIN || errno == EINTR) |
771c9901 | 437 | continue; |
9a2d78b3 | 438 | td_verror(td, errno, "io_uring_enter"); |
771c9901 JA |
439 | break; |
440 | } | |
52885fa2 JA |
441 | } |
442 | } while (events < min); | |
443 | ||
444 | return r < 0 ? r : events; | |
445 | } | |
446 | ||
b2a432bf PC |
447 | static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u) |
448 | { | |
449 | struct ioring_options *o = td->eo; | |
450 | struct ioring_data *ld = td->io_ops_data; | |
e9f6567a DLM |
451 | struct io_uring_sqe *sqe = &ld->sqes[io_u->index]; |
452 | struct cmdprio *cmdprio = &o->cmdprio; | |
12f9d54a | 453 | enum fio_ddir ddir = io_u->ddir; |
a48f0cc7 | 454 | unsigned int p = fio_cmdprio_percentage(cmdprio, io_u); |
1437d635 DLM |
455 | unsigned int cmdprio_value = |
456 | ioprio_value(cmdprio->class[ddir], cmdprio->level[ddir]); | |
e9f6567a DLM |
457 | |
458 | if (p && rand_between(&td->prio_state, 0, 99) < p) { | |
1437d635 DLM |
459 | sqe->ioprio = cmdprio_value; |
460 | if (!td->ioprio || cmdprio_value < td->ioprio) { | |
461 | /* | |
462 | * The async IO priority is higher (has a lower value) | |
463 | * than the priority set by "prio" and "prioclass" | |
464 | * options. | |
465 | */ | |
03ec570f | 466 | io_u->flags |= IO_U_F_HIGH_PRIO; |
1437d635 | 467 | } |
ff9b6876 | 468 | } else { |
1437d635 DLM |
469 | sqe->ioprio = td->ioprio; |
470 | if (cmdprio_value && td->ioprio && td->ioprio < cmdprio_value) { | |
471 | /* | |
472 | * The IO will be executed with the priority set by | |
473 | * "prio" and "prioclass" options, and this priority | |
474 | * is higher (has a lower value) than the async IO | |
475 | * priority. | |
476 | */ | |
03ec570f | 477 | io_u->flags |= IO_U_F_HIGH_PRIO; |
1437d635 | 478 | } |
b2a432bf | 479 | } |
03ec570f DLM |
480 | |
481 | io_u->ioprio = sqe->ioprio; | |
b2a432bf PC |
482 | } |
483 | ||
bffad86f JA |
484 | static enum fio_q_status fio_ioring_queue(struct thread_data *td, |
485 | struct io_u *io_u) | |
52885fa2 | 486 | { |
bffad86f JA |
487 | struct ioring_data *ld = td->io_ops_data; |
488 | struct io_sq_ring *ring = &ld->sq_ring; | |
52885fa2 JA |
489 | unsigned tail, next_tail; |
490 | ||
491 | fio_ro_check(td, io_u); | |
492 | ||
b87aa01a | 493 | if (ld->queued == ld->iodepth) |
52885fa2 JA |
494 | return FIO_Q_BUSY; |
495 | ||
52885fa2 JA |
496 | if (io_u->ddir == DDIR_TRIM) { |
497 | if (ld->queued) | |
498 | return FIO_Q_BUSY; | |
499 | ||
500 | do_io_u_trim(td, io_u); | |
501 | io_u_mark_submit(td, 1); | |
502 | io_u_mark_complete(td, 1); | |
503 | return FIO_Q_COMPLETED; | |
504 | } | |
505 | ||
9a2d78b3 | 506 | tail = *ring->tail; |
52885fa2 | 507 | next_tail = tail + 1; |
9e26aff9 | 508 | if (next_tail == atomic_load_acquire(ring->head)) |
52885fa2 JA |
509 | return FIO_Q_BUSY; |
510 | ||
e9f6567a | 511 | if (ld->use_cmdprio) |
b2a432bf | 512 | fio_ioring_prio_prep(td, io_u); |
b87aa01a | 513 | ring->array[tail & ld->sq_ring_mask] = io_u->index; |
9e26aff9 | 514 | atomic_store_release(ring->tail, next_tail); |
52885fa2 JA |
515 | |
516 | ld->queued++; | |
517 | return FIO_Q_QUEUED; | |
518 | } | |
519 | ||
bffad86f | 520 | static void fio_ioring_queued(struct thread_data *td, int start, int nr) |
52885fa2 | 521 | { |
bffad86f | 522 | struct ioring_data *ld = td->io_ops_data; |
52885fa2 JA |
523 | struct timespec now; |
524 | ||
525 | if (!fio_fill_issue_time(td)) | |
526 | return; | |
527 | ||
528 | fio_gettime(&now, NULL); | |
529 | ||
530 | while (nr--) { | |
bffad86f | 531 | struct io_sq_ring *ring = &ld->sq_ring; |
9a2d78b3 | 532 | int index = ring->array[start & ld->sq_ring_mask]; |
f8289afc | 533 | struct io_u *io_u = ld->io_u_index[index]; |
52885fa2 JA |
534 | |
535 | memcpy(&io_u->issue_time, &now, sizeof(now)); | |
536 | io_u_queued(td, io_u); | |
537 | ||
538 | start++; | |
52885fa2 JA |
539 | } |
540 | } | |
541 | ||
bffad86f | 542 | static int fio_ioring_commit(struct thread_data *td) |
52885fa2 | 543 | { |
bffad86f JA |
544 | struct ioring_data *ld = td->io_ops_data; |
545 | struct ioring_options *o = td->eo; | |
52885fa2 JA |
546 | int ret; |
547 | ||
548 | if (!ld->queued) | |
549 | return 0; | |
550 | ||
3d7d00a3 JA |
551 | /* |
552 | * Kernel side does submission. just need to check if the ring is | |
553 | * flagged as needing a kick, if so, call io_uring_enter(). This | |
554 | * only happens if we've been idle too long. | |
555 | */ | |
556 | if (o->sqpoll_thread) { | |
bffad86f | 557 | struct io_sq_ring *ring = &ld->sq_ring; |
2dd96cc4 | 558 | unsigned flags; |
4cdbc048 | 559 | |
2dd96cc4 JA |
560 | flags = atomic_load_acquire(ring->flags); |
561 | if (flags & IORING_SQ_NEED_WAKEUP) | |
b532dd6d JA |
562 | io_uring_enter(ld, ld->queued, 0, |
563 | IORING_ENTER_SQ_WAKEUP); | |
771c9901 JA |
564 | ld->queued = 0; |
565 | return 0; | |
566 | } | |
567 | ||
52885fa2 | 568 | do { |
9a2d78b3 | 569 | unsigned start = *ld->sq_ring.head; |
52885fa2 JA |
570 | long nr = ld->queued; |
571 | ||
9a2d78b3 | 572 | ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS); |
52885fa2 | 573 | if (ret > 0) { |
bffad86f | 574 | fio_ioring_queued(td, start, ret); |
52885fa2 JA |
575 | io_u_mark_submit(td, ret); |
576 | ||
577 | ld->queued -= ret; | |
578 | ret = 0; | |
a90cd050 JA |
579 | } else if (!ret) { |
580 | io_u_mark_submit(td, ret); | |
52885fa2 | 581 | continue; |
a90cd050 | 582 | } else { |
f6abd731 | 583 | if (errno == EAGAIN || errno == EINTR) { |
bffad86f | 584 | ret = fio_ioring_cqring_reap(td, 0, ld->queued); |
a90cd050 JA |
585 | if (ret) |
586 | continue; | |
587 | /* Shouldn't happen */ | |
588 | usleep(1); | |
589 | continue; | |
52885fa2 | 590 | } |
9a2d78b3 | 591 | td_verror(td, errno, "io_uring_enter submit"); |
52885fa2 | 592 | break; |
a90cd050 | 593 | } |
52885fa2 JA |
594 | } while (ld->queued); |
595 | ||
596 | return ret; | |
597 | } | |
598 | ||
bffad86f | 599 | static void fio_ioring_unmap(struct ioring_data *ld) |
52885fa2 | 600 | { |
9a2d78b3 | 601 | int i; |
52885fa2 | 602 | |
59f94d26 | 603 | for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++) |
9a2d78b3 JA |
604 | munmap(ld->mmap[i].ptr, ld->mmap[i].len); |
605 | close(ld->ring_fd); | |
b87aa01a JA |
606 | } |
607 | ||
bffad86f | 608 | static void fio_ioring_cleanup(struct thread_data *td) |
52885fa2 | 609 | { |
bffad86f | 610 | struct ioring_data *ld = td->io_ops_data; |
52885fa2 JA |
611 | |
612 | if (ld) { | |
52885fa2 | 613 | if (!(td->flags & TD_F_CHILD)) |
bffad86f | 614 | fio_ioring_unmap(ld); |
9a2d78b3 | 615 | |
52885fa2 | 616 | free(ld->io_u_index); |
9a2d78b3 | 617 | free(ld->iovecs); |
5ffd5626 | 618 | free(ld->fds); |
52885fa2 JA |
619 | free(ld); |
620 | } | |
621 | } | |
622 | ||
bffad86f | 623 | static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p) |
9a2d78b3 | 624 | { |
bffad86f JA |
625 | struct io_sq_ring *sring = &ld->sq_ring; |
626 | struct io_cq_ring *cring = &ld->cq_ring; | |
9a2d78b3 JA |
627 | void *ptr; |
628 | ||
e2239016 | 629 | ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32); |
9a2d78b3 JA |
630 | ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE, |
631 | MAP_SHARED | MAP_POPULATE, ld->ring_fd, | |
632 | IORING_OFF_SQ_RING); | |
633 | ld->mmap[0].ptr = ptr; | |
634 | sring->head = ptr + p->sq_off.head; | |
635 | sring->tail = ptr + p->sq_off.tail; | |
636 | sring->ring_mask = ptr + p->sq_off.ring_mask; | |
637 | sring->ring_entries = ptr + p->sq_off.ring_entries; | |
638 | sring->flags = ptr + p->sq_off.flags; | |
ac122fea | 639 | sring->array = ptr + p->sq_off.array; |
9a2d78b3 JA |
640 | ld->sq_ring_mask = *sring->ring_mask; |
641 | ||
f0403f94 JA |
642 | ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe); |
643 | ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE, | |
9a2d78b3 | 644 | MAP_SHARED | MAP_POPULATE, ld->ring_fd, |
f0403f94 JA |
645 | IORING_OFF_SQES); |
646 | ld->mmap[1].ptr = ld->sqes; | |
9a2d78b3 | 647 | |
f0403f94 JA |
648 | ld->mmap[2].len = p->cq_off.cqes + |
649 | p->cq_entries * sizeof(struct io_uring_cqe); | |
9a2d78b3 JA |
650 | ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE, |
651 | MAP_SHARED | MAP_POPULATE, ld->ring_fd, | |
652 | IORING_OFF_CQ_RING); | |
653 | ld->mmap[2].ptr = ptr; | |
654 | cring->head = ptr + p->cq_off.head; | |
655 | cring->tail = ptr + p->cq_off.tail; | |
656 | cring->ring_mask = ptr + p->cq_off.ring_mask; | |
657 | cring->ring_entries = ptr + p->cq_off.ring_entries; | |
f0403f94 | 658 | cring->cqes = ptr + p->cq_off.cqes; |
9a2d78b3 JA |
659 | ld->cq_ring_mask = *cring->ring_mask; |
660 | return 0; | |
661 | } | |
662 | ||
556d8415 JA |
663 | static void fio_ioring_probe(struct thread_data *td) |
664 | { | |
665 | struct ioring_data *ld = td->io_ops_data; | |
666 | struct ioring_options *o = td->eo; | |
667 | struct io_uring_probe *p; | |
668 | int ret; | |
669 | ||
670 | /* already set by user, don't touch */ | |
671 | if (o->nonvectored != -1) | |
672 | return; | |
673 | ||
674 | /* default to off, as that's always safe */ | |
675 | o->nonvectored = 0; | |
676 | ||
677 | p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); | |
678 | if (!p) | |
679 | return; | |
680 | ||
681 | memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); | |
682 | ret = syscall(__NR_io_uring_register, ld->ring_fd, | |
683 | IORING_REGISTER_PROBE, p, 256); | |
684 | if (ret < 0) | |
685 | goto out; | |
686 | ||
687 | if (IORING_OP_WRITE > p->ops_len) | |
688 | goto out; | |
689 | ||
690 | if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED) && | |
691 | (p->ops[IORING_OP_WRITE].flags & IO_URING_OP_SUPPORTED)) | |
692 | o->nonvectored = 1; | |
693 | out: | |
694 | free(p); | |
695 | } | |
696 | ||
bffad86f | 697 | static int fio_ioring_queue_init(struct thread_data *td) |
52885fa2 | 698 | { |
bffad86f JA |
699 | struct ioring_data *ld = td->io_ops_data; |
700 | struct ioring_options *o = td->eo; | |
52885fa2 | 701 | int depth = td->o.iodepth; |
bffad86f | 702 | struct io_uring_params p; |
9a2d78b3 JA |
703 | int ret; |
704 | ||
705 | memset(&p, 0, sizeof(p)); | |
52885fa2 JA |
706 | |
707 | if (o->hipri) | |
bffad86f | 708 | p.flags |= IORING_SETUP_IOPOLL; |
3d7d00a3 JA |
709 | if (o->sqpoll_thread) { |
710 | p.flags |= IORING_SETUP_SQPOLL; | |
711 | if (o->sqpoll_set) { | |
712 | p.flags |= IORING_SETUP_SQ_AFF; | |
713 | p.sq_thread_cpu = o->sqpoll_cpu; | |
714 | } | |
f635f1fb | 715 | } |
a90cd050 | 716 | |
bfed648c | 717 | ret = syscall(__NR_io_uring_setup, depth, &p); |
9a2d78b3 JA |
718 | if (ret < 0) |
719 | return ret; | |
720 | ||
721 | ld->ring_fd = ret; | |
2ea53ca3 | 722 | |
556d8415 JA |
723 | fio_ioring_probe(td); |
724 | ||
2ea53ca3 | 725 | if (o->fixedbufs) { |
bfed648c | 726 | ret = syscall(__NR_io_uring_register, ld->ring_fd, |
919850d2 | 727 | IORING_REGISTER_BUFFERS, ld->iovecs, depth); |
2ea53ca3 JA |
728 | if (ret < 0) |
729 | return ret; | |
730 | } | |
731 | ||
bffad86f | 732 | return fio_ioring_mmap(ld, &p); |
52885fa2 JA |
733 | } |
734 | ||
5ffd5626 JA |
735 | static int fio_ioring_register_files(struct thread_data *td) |
736 | { | |
737 | struct ioring_data *ld = td->io_ops_data; | |
738 | struct fio_file *f; | |
739 | unsigned int i; | |
740 | int ret; | |
741 | ||
742 | ld->fds = calloc(td->o.nr_files, sizeof(int)); | |
743 | ||
744 | for_each_file(td, f, i) { | |
745 | ret = generic_open_file(td, f); | |
746 | if (ret) | |
747 | goto err; | |
748 | ld->fds[i] = f->fd; | |
749 | f->engine_pos = i; | |
750 | } | |
751 | ||
bfed648c | 752 | ret = syscall(__NR_io_uring_register, ld->ring_fd, |
5ffd5626 JA |
753 | IORING_REGISTER_FILES, ld->fds, td->o.nr_files); |
754 | if (ret) { | |
755 | err: | |
756 | free(ld->fds); | |
757 | ld->fds = NULL; | |
758 | } | |
759 | ||
760 | /* | |
761 | * Pretend the file is closed again, and really close it if we hit | |
762 | * an error. | |
763 | */ | |
764 | for_each_file(td, f, i) { | |
765 | if (ret) { | |
766 | int fio_unused ret2; | |
767 | ret2 = generic_close_file(td, f); | |
768 | } else | |
769 | f->fd = -1; | |
770 | } | |
771 | ||
772 | return ret; | |
773 | } | |
774 | ||
bffad86f | 775 | static int fio_ioring_post_init(struct thread_data *td) |
52885fa2 | 776 | { |
bffad86f | 777 | struct ioring_data *ld = td->io_ops_data; |
5ffd5626 | 778 | struct ioring_options *o = td->eo; |
52885fa2 | 779 | struct io_u *io_u; |
650346e1 | 780 | int err, i; |
52885fa2 | 781 | |
650346e1 JA |
782 | for (i = 0; i < td->o.iodepth; i++) { |
783 | struct iovec *iov = &ld->iovecs[i]; | |
9a2d78b3 | 784 | |
650346e1 JA |
785 | io_u = ld->io_u_index[i]; |
786 | iov->iov_base = io_u->buf; | |
787 | iov->iov_len = td_max_bs(td); | |
52885fa2 JA |
788 | } |
789 | ||
bffad86f | 790 | err = fio_ioring_queue_init(td); |
52885fa2 | 791 | if (err) { |
0442b53f | 792 | int init_err = errno; |
c4f5c92f | 793 | |
0442b53f | 794 | if (init_err == ENOSYS) |
c4f5c92f | 795 | log_err("fio: your kernel doesn't support io_uring\n"); |
0442b53f | 796 | td_verror(td, init_err, "io_queue_init"); |
52885fa2 JA |
797 | return 1; |
798 | } | |
799 | ||
7c70f506 JA |
800 | for (i = 0; i < td->o.iodepth; i++) { |
801 | struct io_uring_sqe *sqe; | |
802 | ||
803 | sqe = &ld->sqes[i]; | |
804 | memset(sqe, 0, sizeof(*sqe)); | |
805 | } | |
806 | ||
5ffd5626 JA |
807 | if (o->registerfiles) { |
808 | err = fio_ioring_register_files(td); | |
809 | if (err) { | |
810 | td_verror(td, errno, "ioring_register_files"); | |
811 | return 1; | |
812 | } | |
813 | } | |
814 | ||
52885fa2 JA |
815 | return 0; |
816 | } | |
817 | ||
bffad86f | 818 | static int fio_ioring_init(struct thread_data *td) |
52885fa2 | 819 | { |
5ffd5626 | 820 | struct ioring_options *o = td->eo; |
bffad86f | 821 | struct ioring_data *ld; |
e9f6567a | 822 | struct cmdprio *cmdprio = &o->cmdprio; |
1437d635 | 823 | bool has_cmdprio = false; |
e9f6567a | 824 | int ret; |
52885fa2 | 825 | |
5ffd5626 JA |
826 | /* sqthread submission requires registered files */ |
827 | if (o->sqpoll_thread) | |
828 | o->registerfiles = 1; | |
829 | ||
830 | if (o->registerfiles && td->o.nr_files != td->o.open_files) { | |
831 | log_err("fio: io_uring registered files require nr_files to " | |
832 | "be identical to open_files\n"); | |
833 | return 1; | |
834 | } | |
835 | ||
52885fa2 JA |
836 | ld = calloc(1, sizeof(*ld)); |
837 | ||
b87aa01a JA |
838 | /* ring depth must be a power-of-2 */ |
839 | ld->iodepth = td->o.iodepth; | |
840 | td->o.iodepth = roundup_pow2(td->o.iodepth); | |
841 | ||
52885fa2 JA |
842 | /* io_u index */ |
843 | ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *)); | |
650346e1 | 844 | ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec)); |
52885fa2 JA |
845 | |
846 | td->io_ops_data = ld; | |
b2a432bf | 847 | |
1437d635 | 848 | ret = fio_cmdprio_init(td, cmdprio, &has_cmdprio); |
e9f6567a DLM |
849 | if (ret) { |
850 | td_verror(td, EINVAL, "fio_ioring_init"); | |
b2a432bf PC |
851 | return 1; |
852 | } | |
1af44196 | 853 | |
1437d635 DLM |
854 | /* |
855 | * Since io_uring can have a submission context (sqthread_poll) that is | |
856 | different from the process context, we cannot rely on the IO | |
857 | * priority set by ioprio_set() (option prio/prioclass) to be inherited. | |
858 | * Therefore, we set the sqe->ioprio field when prio/prioclass is used. | |
859 | */ | |
860 | ld->use_cmdprio = has_cmdprio || | |
861 | fio_option_is_set(&td->o, ioprio_class) || | |
862 | fio_option_is_set(&td->o, ioprio); | |
1af44196 | 863 | |
52885fa2 JA |
864 | return 0; |
865 | } | |
866 | ||
bffad86f | 867 | static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u) |
52885fa2 | 868 | { |
bffad86f | 869 | struct ioring_data *ld = td->io_ops_data; |
52885fa2 JA |
870 | |
871 | ld->io_u_index[io_u->index] = io_u; | |
872 | return 0; | |
873 | } | |
874 | ||
5ffd5626 JA |
875 | static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f) |
876 | { | |
877 | struct ioring_data *ld = td->io_ops_data; | |
878 | struct ioring_options *o = td->eo; | |
879 | ||
17318cf6 | 880 | if (!ld || !o->registerfiles) |
5ffd5626 JA |
881 | return generic_open_file(td, f); |
882 | ||
883 | f->fd = ld->fds[f->engine_pos]; | |
884 | return 0; | |
885 | } | |
886 | ||
887 | static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f) | |
888 | { | |
17318cf6 | 889 | struct ioring_data *ld = td->io_ops_data; |
5ffd5626 JA |
890 | struct ioring_options *o = td->eo; |
891 | ||
17318cf6 | 892 | if (!ld || !o->registerfiles) |
5ffd5626 JA |
893 | return generic_close_file(td, f); |
894 | ||
895 | f->fd = -1; | |
896 | return 0; | |
897 | } | |
898 | ||
52885fa2 | 899 | static struct ioengine_ops ioengine = { |
bffad86f | 900 | .name = "io_uring", |
52885fa2 | 901 | .version = FIO_IOOPS_VERSION, |
8bfe330e | 902 | .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD, |
bffad86f JA |
903 | .init = fio_ioring_init, |
904 | .post_init = fio_ioring_post_init, | |
905 | .io_u_init = fio_ioring_io_u_init, | |
906 | .prep = fio_ioring_prep, | |
907 | .queue = fio_ioring_queue, | |
908 | .commit = fio_ioring_commit, | |
909 | .getevents = fio_ioring_getevents, | |
910 | .event = fio_ioring_event, | |
911 | .cleanup = fio_ioring_cleanup, | |
5ffd5626 JA |
912 | .open_file = fio_ioring_open_file, |
913 | .close_file = fio_ioring_close_file, | |
52885fa2 JA |
914 | .get_file_size = generic_get_file_size, |
915 | .options = options, | |
bffad86f | 916 | .option_struct_size = sizeof(struct ioring_options), |
52885fa2 JA |
917 | }; |
918 | ||
bffad86f | 919 | static void fio_init fio_ioring_register(void) |
52885fa2 | 920 | { |
52885fa2 | 921 | register_ioengine(&ioengine); |
52885fa2 JA |
922 | } |
923 | ||
bffad86f | 924 | static void fio_exit fio_ioring_unregister(void) |
52885fa2 | 925 | { |
52885fa2 | 926 | unregister_ioengine(&ioengine); |
52885fa2 | 927 | } |
1f90e9bb | 928 | #endif |