io_uring: use fget/fput_many() for file references
[linux-2.6-block.git] / fs / io_uring.c
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side. When the application reads the CQ ring
8 * tail, it must use an appropriate smp_rmb() to order with the smp_wmb()
9 * the kernel uses after writing the tail. Failure to do so could cause a
 10 * delay in when the application notices that completion events are available.
11 * This isn't a fatal condition. Likewise, the application must use an
12 * appropriate smp_wmb() both before writing the SQ tail, and after writing
13 * the SQ tail. The first one orders the sqe writes with the tail write, and
14 * the latter is paired with the smp_rmb() the kernel will issue before
15 * reading the SQ tail on submission.
16 *
17 * Also see the examples in the liburing library:
18 *
19 * git://git.kernel.dk/liburing
20 *
21 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 22 * from data shared between the kernel and application. This is done both
 23 * for ordering purposes and to ensure that once a value is loaded from
24 * data that the application could potentially modify, it remains stable.
25 *
26 * Copyright (C) 2018-2019 Jens Axboe
 27 * Copyright (c) 2018-2019 Christoph Hellwig
28 */
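/*
 * An illustrative sketch of the application side of the barrier pairing
 * described above. This is not normative code; sq, cq, sqes and the masks
 * stand in for the mmap'ed ring structures set up via io_uring_setup().
 *
 * Submission:
 *	sqes[idx] = ...;			fill out the sqe
 *	sq->array[tail & sq_mask] = idx;
 *	smp_wmb();				order sqe/array stores vs. the tail
 *	sq->tail = tail + 1;
 *	smp_wmb();				paired with the kernel's smp_rmb()
 *
 * Completion reaping:
 *	cq_tail = cq->tail;
 *	smp_rmb();				paired with the kernel's smp_wmb()
 *	while (cq_head != cq_tail)
 *		handle(&cq->cqes[cq_head++ & cq_mask]);
 *	cq->head = cq_head;
 */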
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/errno.h>
32#include <linux/syscalls.h>
33#include <linux/compat.h>
34#include <linux/refcount.h>
35#include <linux/uio.h>
36
37#include <linux/sched/signal.h>
38#include <linux/fs.h>
39#include <linux/file.h>
40#include <linux/fdtable.h>
41#include <linux/mm.h>
42#include <linux/mman.h>
43#include <linux/mmu_context.h>
44#include <linux/percpu.h>
45#include <linux/slab.h>
46#include <linux/workqueue.h>
47#include <linux/blkdev.h>
48#include <linux/net.h>
49#include <net/sock.h>
50#include <net/af_unix.h>
51#include <linux/anon_inodes.h>
52#include <linux/sched/mm.h>
53#include <linux/uaccess.h>
54#include <linux/nospec.h>
55
56#include <uapi/linux/io_uring.h>
57
58#include "internal.h"
59
60#define IORING_MAX_ENTRIES 4096
61
62struct io_uring {
63 u32 head ____cacheline_aligned_in_smp;
64 u32 tail ____cacheline_aligned_in_smp;
65};
66
67struct io_sq_ring {
68 struct io_uring r;
69 u32 ring_mask;
70 u32 ring_entries;
71 u32 dropped;
72 u32 flags;
73 u32 array[];
74};
75
76struct io_cq_ring {
77 struct io_uring r;
78 u32 ring_mask;
79 u32 ring_entries;
80 u32 overflow;
81 struct io_uring_cqe cqes[];
82};
83
84struct io_ring_ctx {
85 struct {
86 struct percpu_ref refs;
87 } ____cacheline_aligned_in_smp;
88
89 struct {
90 unsigned int flags;
91 bool compat;
92 bool account_mem;
93
94 /* SQ ring */
95 struct io_sq_ring *sq_ring;
96 unsigned cached_sq_head;
97 unsigned sq_entries;
98 unsigned sq_mask;
99 struct io_uring_sqe *sq_sqes;
100 } ____cacheline_aligned_in_smp;
101
102 /* IO offload */
103 struct workqueue_struct *sqo_wq;
104 struct mm_struct *sqo_mm;
105
106 struct {
107 /* CQ ring */
108 struct io_cq_ring *cq_ring;
109 unsigned cached_cq_tail;
110 unsigned cq_entries;
111 unsigned cq_mask;
112 struct wait_queue_head cq_wait;
113 struct fasync_struct *cq_fasync;
114 } ____cacheline_aligned_in_smp;
115
116 struct user_struct *user;
117
118 struct completion ctx_done;
119
120 struct {
121 struct mutex uring_lock;
122 wait_queue_head_t wait;
123 } ____cacheline_aligned_in_smp;
124
125 struct {
126 spinlock_t completion_lock;
127 bool poll_multi_file;
128 /*
129 * ->poll_list is protected by the ctx->uring_lock for
130 * io_uring instances that don't use IORING_SETUP_SQPOLL.
131 * For SQPOLL, only the single threaded io_sq_thread() will
132 * manipulate the list, hence no extra locking is needed there.
133 */
134 struct list_head poll_list;
135 } ____cacheline_aligned_in_smp;
136
137#if defined(CONFIG_UNIX)
138 struct socket *ring_sock;
139#endif
140};
141
142struct sqe_submit {
143 const struct io_uring_sqe *sqe;
144 unsigned short index;
145 bool has_user;
 146 bool needs_lock;
147};
148
149struct io_kiocb {
150 struct kiocb rw;
151
152 struct sqe_submit submit;
153
154 struct io_ring_ctx *ctx;
155 struct list_head list;
156 unsigned int flags;
157#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */
 158#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
 159 u64 user_data;
 160 u64 error;
161
162 struct work_struct work;
163};
164
165#define IO_PLUG_THRESHOLD 2
 166#define IO_IOPOLL_BATCH 8
 167
168struct io_submit_state {
169 struct blk_plug plug;
170
171 /*
172 * File reference cache
173 */
174 struct file *file;
175 unsigned int fd;
176 unsigned int has_refs;
177 unsigned int used_refs;
178 unsigned int ios_left;
179};
180
181static struct kmem_cache *req_cachep;
182
183static const struct file_operations io_uring_fops;
184
185struct sock *io_uring_get_socket(struct file *file)
186{
187#if defined(CONFIG_UNIX)
188 if (file->f_op == &io_uring_fops) {
189 struct io_ring_ctx *ctx = file->private_data;
190
191 return ctx->ring_sock->sk;
192 }
193#endif
194 return NULL;
195}
196EXPORT_SYMBOL(io_uring_get_socket);
197
198static void io_ring_ctx_ref_free(struct percpu_ref *ref)
199{
200 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
201
202 complete(&ctx->ctx_done);
203}
204
205static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
206{
207 struct io_ring_ctx *ctx;
208
209 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
210 if (!ctx)
211 return NULL;
212
213 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
214 kfree(ctx);
215 return NULL;
216 }
217
218 ctx->flags = p->flags;
219 init_waitqueue_head(&ctx->cq_wait);
220 init_completion(&ctx->ctx_done);
221 mutex_init(&ctx->uring_lock);
222 init_waitqueue_head(&ctx->wait);
223 spin_lock_init(&ctx->completion_lock);
 224 INIT_LIST_HEAD(&ctx->poll_list);
225 return ctx;
226}
227
228static void io_commit_cqring(struct io_ring_ctx *ctx)
229{
230 struct io_cq_ring *ring = ctx->cq_ring;
231
232 if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
233 /* order cqe stores with ring update */
234 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
235
236 /*
 237 * Write side barrier of tail update, app has read side. See
238 * comment at the top of this file.
239 */
240 smp_wmb();
241
242 if (wq_has_sleeper(&ctx->cq_wait)) {
243 wake_up_interruptible(&ctx->cq_wait);
244 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
245 }
246 }
247}
248
249static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
250{
251 struct io_cq_ring *ring = ctx->cq_ring;
252 unsigned tail;
253
254 tail = ctx->cached_cq_tail;
255 /* See comment at the top of the file */
256 smp_rmb();
257 if (tail + 1 == READ_ONCE(ring->r.head))
258 return NULL;
259
260 ctx->cached_cq_tail++;
261 return &ring->cqes[tail & ctx->cq_mask];
262}
263
264static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
265 long res, unsigned ev_flags)
266{
267 struct io_uring_cqe *cqe;
268
269 /*
270 * If we can't get a cq entry, userspace overflowed the
271 * submission (by quite a lot). Increment the overflow count in
272 * the ring.
273 */
274 cqe = io_get_cqring(ctx);
275 if (cqe) {
276 WRITE_ONCE(cqe->user_data, ki_user_data);
277 WRITE_ONCE(cqe->res, res);
278 WRITE_ONCE(cqe->flags, ev_flags);
279 } else {
280 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
281
282 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
283 }
284}
285
286static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
287 long res, unsigned ev_flags)
288{
289 unsigned long flags;
290
291 spin_lock_irqsave(&ctx->completion_lock, flags);
292 io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
293 io_commit_cqring(ctx);
294 spin_unlock_irqrestore(&ctx->completion_lock, flags);
295
296 if (waitqueue_active(&ctx->wait))
297 wake_up(&ctx->wait);
298}
299
300static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
301{
302 percpu_ref_put_many(&ctx->refs, refs);
303
304 if (waitqueue_active(&ctx->wait))
305 wake_up(&ctx->wait);
306}
307
308static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
309{
310 struct io_kiocb *req;
311
312 if (!percpu_ref_tryget(&ctx->refs))
313 return NULL;
314
315 req = kmem_cache_alloc(req_cachep, __GFP_NOWARN);
316 if (req) {
317 req->ctx = ctx;
318 req->flags = 0;
319 return req;
320 }
321
322 io_ring_drop_ctx_refs(ctx, 1);
323 return NULL;
324}
325
326static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
327{
328 if (*nr) {
329 kmem_cache_free_bulk(req_cachep, *nr, reqs);
330 io_ring_drop_ctx_refs(ctx, *nr);
331 *nr = 0;
332 }
333}
334
335static void io_free_req(struct io_kiocb *req)
336{
337 io_ring_drop_ctx_refs(req->ctx, 1);
338 kmem_cache_free(req_cachep, req);
339}
340
341/*
342 * Find and free completed poll iocbs
343 */
344static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
345 struct list_head *done)
346{
347 void *reqs[IO_IOPOLL_BATCH];
348 int file_count, to_free;
349 struct file *file = NULL;
 350 struct io_kiocb *req;
 351
 352 file_count = to_free = 0;
353 while (!list_empty(done)) {
354 req = list_first_entry(done, struct io_kiocb, list);
355 list_del(&req->list);
356
357 io_cqring_fill_event(ctx, req->user_data, req->error, 0);
358
359 reqs[to_free++] = req;
360 (*nr_events)++;
361
362 /*
363 * Batched puts of the same file, to avoid dirtying the
364 * file usage count multiple times, if avoidable.
365 */
366 if (!file) {
367 file = req->rw.ki_filp;
368 file_count = 1;
369 } else if (file == req->rw.ki_filp) {
370 file_count++;
371 } else {
372 fput_many(file, file_count);
373 file = req->rw.ki_filp;
374 file_count = 1;
375 }
376
377 if (to_free == ARRAY_SIZE(reqs))
378 io_free_req_many(ctx, reqs, &to_free);
379 }
380 io_commit_cqring(ctx);
381
382 if (file)
383 fput_many(file, file_count);
384 io_free_req_many(ctx, reqs, &to_free);
385}
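/*
 * Worked example of the batching above (numbers are illustrative only):
 * if eight polled requests complete against the same file, the loop issues
 * a single fput_many(file, 8) instead of eight separate fput() calls, so
 * the file reference count is dirtied once per batch rather than once per
 * request.
 */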
386
387static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
388 long min)
389{
390 struct io_kiocb *req, *tmp;
391 LIST_HEAD(done);
392 bool spin;
393 int ret;
394
395 /*
396 * Only spin for completions if we don't have multiple devices hanging
397 * off our complete list, and we're under the requested amount.
398 */
399 spin = !ctx->poll_multi_file && *nr_events < min;
400
401 ret = 0;
402 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
403 struct kiocb *kiocb = &req->rw;
404
405 /*
406 * Move completed entries to our local list. If we find a
407 * request that requires polling, break out and complete
408 * the done list first, if we have entries there.
409 */
410 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
411 list_move_tail(&req->list, &done);
412 continue;
413 }
414 if (!list_empty(&done))
415 break;
416
417 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
418 if (ret < 0)
419 break;
420
421 if (ret && spin)
422 spin = false;
423 ret = 0;
424 }
425
426 if (!list_empty(&done))
427 io_iopoll_complete(ctx, nr_events, &done);
428
429 return ret;
430}
431
432/*
 433 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
434 * non-spinning poll check - we'll still enter the driver poll loop, but only
435 * as a non-spinning completion check.
436 */
437static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
438 long min)
439{
440 while (!list_empty(&ctx->poll_list)) {
441 int ret;
442
443 ret = io_do_iopoll(ctx, nr_events, min);
444 if (ret < 0)
445 return ret;
446 if (!min || *nr_events >= min)
447 return 0;
448 }
449
450 return 1;
451}
452
453/*
454 * We can't just wait for polled events to come to us, we have to actively
455 * find and complete them.
456 */
457static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
458{
459 if (!(ctx->flags & IORING_SETUP_IOPOLL))
460 return;
461
462 mutex_lock(&ctx->uring_lock);
463 while (!list_empty(&ctx->poll_list)) {
464 unsigned int nr_events = 0;
465
466 io_iopoll_getevents(ctx, &nr_events, 1);
467 }
468 mutex_unlock(&ctx->uring_lock);
469}
470
471static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
472 long min)
473{
474 int ret = 0;
475
476 do {
477 int tmin = 0;
478
479 if (*nr_events < min)
480 tmin = min - *nr_events;
481
482 ret = io_iopoll_getevents(ctx, nr_events, tmin);
483 if (ret <= 0)
484 break;
485 ret = 0;
486 } while (min && !*nr_events && !need_resched());
487
488 return ret;
489}
490
491static void kiocb_end_write(struct kiocb *kiocb)
492{
493 if (kiocb->ki_flags & IOCB_WRITE) {
494 struct inode *inode = file_inode(kiocb->ki_filp);
495
496 /*
497 * Tell lockdep we inherited freeze protection from submission
498 * thread.
499 */
500 if (S_ISREG(inode->i_mode))
501 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
502 file_end_write(kiocb->ki_filp);
503 }
504}
505
506static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
507{
508 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
509
510 kiocb_end_write(kiocb);
511
512 fput(kiocb->ki_filp);
513 io_cqring_add_event(req->ctx, req->user_data, res, 0);
514 io_free_req(req);
515}
516
517static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
518{
519 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
520
521 kiocb_end_write(kiocb);
522
523 req->error = res;
524 if (res != -EAGAIN)
525 req->flags |= REQ_F_IOPOLL_COMPLETED;
526}
527
528/*
529 * After the iocb has been issued, it's safe to be found on the poll list.
530 * Adding the kiocb to the list AFTER submission ensures that we don't
531 * find it from a io_iopoll_getevents() thread before the issuer is done
532 * accessing the kiocb cookie.
533 */
534static void io_iopoll_req_issued(struct io_kiocb *req)
535{
536 struct io_ring_ctx *ctx = req->ctx;
537
538 /*
539 * Track whether we have multiple files in our lists. This will impact
540 * how we do polling eventually, not spinning if we're on potentially
541 * different devices.
542 */
543 if (list_empty(&ctx->poll_list)) {
544 ctx->poll_multi_file = false;
545 } else if (!ctx->poll_multi_file) {
546 struct io_kiocb *list_req;
547
548 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
549 list);
550 if (list_req->rw.ki_filp != req->rw.ki_filp)
551 ctx->poll_multi_file = true;
552 }
553
554 /*
555 * For fast devices, IO may have already completed. If it has, add
556 * it to the front so we find it first.
557 */
558 if (req->flags & REQ_F_IOPOLL_COMPLETED)
559 list_add(&req->list, &ctx->poll_list);
560 else
561 list_add_tail(&req->list, &ctx->poll_list);
562}
563
564static void io_file_put(struct io_submit_state *state, struct file *file)
565{
566 if (!state) {
567 fput(file);
568 } else if (state->file) {
569 int diff = state->has_refs - state->used_refs;
570
571 if (diff)
572 fput_many(state->file, diff);
573 state->file = NULL;
574 }
575}
576
577/*
578 * Get as many references to a file as we have IOs left in this submission,
579 * assuming most submissions are for one file, or at least that each file
580 * has more than one submission.
581 */
582static struct file *io_file_get(struct io_submit_state *state, int fd)
583{
584 if (!state)
585 return fget(fd);
586
587 if (state->file) {
588 if (state->fd == fd) {
589 state->used_refs++;
590 state->ios_left--;
591 return state->file;
592 }
593 io_file_put(state, NULL);
594 }
595 state->file = fget_many(fd, state->ios_left);
596 if (!state->file)
597 return NULL;
598
599 state->fd = fd;
600 state->has_refs = state->ios_left;
601 state->used_refs = 1;
602 state->ios_left--;
603 return state->file;
604}
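/*
 * A minimal sketch of the accounting above, assuming a batch of three
 * submissions that all target the same fd (values are illustrative only):
 *
 *	io_file_get(state, fd);		fget_many(fd, 3): has_refs=3, used_refs=1
 *	io_file_get(state, fd);		cached hit: used_refs=2
 *	io_file_get(state, fd);		cached hit: used_refs=3
 *	io_submit_state_end(state);	io_file_put() finds no surplus to drop
 *
 * If the batch had switched to a different fd partway through, io_file_put()
 * would hand the unused references back in one go via fput_many().
 */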
605
606/*
607 * If we tracked the file through the SCM inflight mechanism, we could support
608 * any file. For now, just ensure that anything potentially problematic is done
609 * inline.
610 */
611static bool io_file_supports_async(struct file *file)
612{
613 umode_t mode = file_inode(file)->i_mode;
614
615 if (S_ISBLK(mode) || S_ISCHR(mode))
616 return true;
617 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
618 return true;
619
620 return false;
621}
622
623static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 624 bool force_nonblock, struct io_submit_state *state)
 625{
 626 struct io_ring_ctx *ctx = req->ctx;
627 struct kiocb *kiocb = &req->rw;
628 unsigned ioprio;
629 int fd, ret;
630
631 /* For -EAGAIN retry, everything is already prepped */
632 if (kiocb->ki_filp)
633 return 0;
634
635 fd = READ_ONCE(sqe->fd);
 636 kiocb->ki_filp = io_file_get(state, fd);
637 if (unlikely(!kiocb->ki_filp))
638 return -EBADF;
639 if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
640 force_nonblock = false;
641 kiocb->ki_pos = READ_ONCE(sqe->off);
642 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
643 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
644
645 ioprio = READ_ONCE(sqe->ioprio);
646 if (ioprio) {
647 ret = ioprio_check_cap(ioprio);
648 if (ret)
649 goto out_fput;
650
651 kiocb->ki_ioprio = ioprio;
652 } else
653 kiocb->ki_ioprio = get_current_ioprio();
654
655 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
656 if (unlikely(ret))
657 goto out_fput;
658 if (force_nonblock) {
659 kiocb->ki_flags |= IOCB_NOWAIT;
660 req->flags |= REQ_F_FORCE_NONBLOCK;
661 }
662 if (ctx->flags & IORING_SETUP_IOPOLL) {
663 ret = -EOPNOTSUPP;
664 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
665 !kiocb->ki_filp->f_op->iopoll)
666 goto out_fput;
 667
668 req->error = 0;
669 kiocb->ki_flags |= IOCB_HIPRI;
670 kiocb->ki_complete = io_complete_rw_iopoll;
671 } else {
672 if (kiocb->ki_flags & IOCB_HIPRI) {
673 ret = -EINVAL;
674 goto out_fput;
675 }
676 kiocb->ki_complete = io_complete_rw;
677 }
678 return 0;
679out_fput:
680 /* in case of error, we didn't use this file reference. drop it. */
681 if (state)
682 state->used_refs--;
683 io_file_put(state, kiocb->ki_filp);
684 return ret;
685}
686
687static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
688{
689 switch (ret) {
690 case -EIOCBQUEUED:
691 break;
692 case -ERESTARTSYS:
693 case -ERESTARTNOINTR:
694 case -ERESTARTNOHAND:
695 case -ERESTART_RESTARTBLOCK:
696 /*
697 * We can't just restart the syscall, since previously
698 * submitted sqes may already be in progress. Just fail this
699 * IO with EINTR.
700 */
701 ret = -EINTR;
702 /* fall through */
703 default:
704 kiocb->ki_complete(kiocb, ret, 0);
705 }
706}
707
708static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
709 const struct sqe_submit *s, struct iovec **iovec,
710 struct iov_iter *iter)
711{
712 const struct io_uring_sqe *sqe = s->sqe;
713 void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
714 size_t sqe_len = READ_ONCE(sqe->len);
715
716 if (!s->has_user)
717 return -EFAULT;
718
719#ifdef CONFIG_COMPAT
720 if (ctx->compat)
721 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
722 iovec, iter);
723#endif
724
725 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
726}
727
728static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 729 bool force_nonblock, struct io_submit_state *state)
730{
731 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
732 struct kiocb *kiocb = &req->rw;
733 struct iov_iter iter;
734 struct file *file;
735 ssize_t ret;
736
 737 ret = io_prep_rw(req, s->sqe, force_nonblock, state);
738 if (ret)
739 return ret;
740 file = kiocb->ki_filp;
741
742 ret = -EBADF;
743 if (unlikely(!(file->f_mode & FMODE_READ)))
744 goto out_fput;
745 ret = -EINVAL;
746 if (unlikely(!file->f_op->read_iter))
747 goto out_fput;
748
749 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
750 if (ret)
751 goto out_fput;
752
753 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter));
754 if (!ret) {
755 ssize_t ret2;
756
757 /* Catch -EAGAIN return for forced non-blocking submission */
758 ret2 = call_read_iter(file, kiocb, &iter);
759 if (!force_nonblock || ret2 != -EAGAIN)
760 io_rw_done(kiocb, ret2);
761 else
762 ret = -EAGAIN;
763 }
764 kfree(iovec);
765out_fput:
766 /* Hold on to the file for -EAGAIN */
767 if (unlikely(ret && ret != -EAGAIN))
768 fput(file);
769 return ret;
770}
771
772static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 773 bool force_nonblock, struct io_submit_state *state)
774{
775 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
776 struct kiocb *kiocb = &req->rw;
777 struct iov_iter iter;
778 struct file *file;
779 ssize_t ret;
780
 781 ret = io_prep_rw(req, s->sqe, force_nonblock, state);
782 if (ret)
783 return ret;
784 /* Hold on to the file for -EAGAIN */
785 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
786 return -EAGAIN;
787
788 ret = -EBADF;
789 file = kiocb->ki_filp;
790 if (unlikely(!(file->f_mode & FMODE_WRITE)))
791 goto out_fput;
792 ret = -EINVAL;
793 if (unlikely(!file->f_op->write_iter))
794 goto out_fput;
795
796 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
797 if (ret)
798 goto out_fput;
799
800 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos,
801 iov_iter_count(&iter));
802 if (!ret) {
803 /*
804 * Open-code file_start_write here to grab freeze protection,
805 * which will be released by another thread in
806 * io_complete_rw(). Fool lockdep by telling it the lock got
807 * released so that it doesn't complain about the held lock when
808 * we return to userspace.
809 */
810 if (S_ISREG(file_inode(file)->i_mode)) {
811 __sb_start_write(file_inode(file)->i_sb,
812 SB_FREEZE_WRITE, true);
813 __sb_writers_release(file_inode(file)->i_sb,
814 SB_FREEZE_WRITE);
815 }
816 kiocb->ki_flags |= IOCB_WRITE;
817 io_rw_done(kiocb, call_write_iter(file, kiocb, &iter));
818 }
819 kfree(iovec);
820out_fput:
821 if (unlikely(ret))
822 fput(file);
823 return ret;
824}
825
826/*
827 * IORING_OP_NOP just posts a completion event, nothing else.
828 */
829static int io_nop(struct io_kiocb *req, u64 user_data)
830{
831 struct io_ring_ctx *ctx = req->ctx;
832 long err = 0;
833
834 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
835 return -EINVAL;
836
837 /*
838 * Twilight zone - it's possible that someone issued an opcode that
839 * has a file attached, then got -EAGAIN on submission, and changed
840 * the sqe before we retried it from async context. Avoid dropping
841 * a file reference for this malicious case, and flag the error.
842 */
843 if (req->rw.ki_filp) {
844 err = -EBADF;
845 fput(req->rw.ki_filp);
846 }
847 io_cqring_add_event(ctx, user_data, err, 0);
848 io_free_req(req);
849 return 0;
850}
851
852static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
853{
854 int fd;
855
856 /* Prep already done */
857 if (req->rw.ki_filp)
858 return 0;
859
860 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
861 return -EINVAL;
862 if (unlikely(sqe->addr || sqe->ioprio))
863 return -EINVAL;
864
865 fd = READ_ONCE(sqe->fd);
866 req->rw.ki_filp = fget(fd);
867 if (unlikely(!req->rw.ki_filp))
868 return -EBADF;
869
870 return 0;
871}
872
873static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
874 bool force_nonblock)
875{
876 loff_t sqe_off = READ_ONCE(sqe->off);
877 loff_t sqe_len = READ_ONCE(sqe->len);
878 loff_t end = sqe_off + sqe_len;
879 unsigned fsync_flags;
880 int ret;
881
882 fsync_flags = READ_ONCE(sqe->fsync_flags);
883 if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
884 return -EINVAL;
885
886 ret = io_prep_fsync(req, sqe);
887 if (ret)
888 return ret;
889
890 /* fsync always requires a blocking context */
891 if (force_nonblock)
892 return -EAGAIN;
893
894 ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
895 end > 0 ? end : LLONG_MAX,
896 fsync_flags & IORING_FSYNC_DATASYNC);
897
898 fput(req->rw.ki_filp);
899 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
900 io_free_req(req);
901 return 0;
902}
903
 904static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
905 const struct sqe_submit *s, bool force_nonblock,
906 struct io_submit_state *state)
907{
908 ssize_t ret;
909 int opcode;
910
911 if (unlikely(s->index >= ctx->sq_entries))
912 return -EINVAL;
913 req->user_data = READ_ONCE(s->sqe->user_data);
914
915 opcode = READ_ONCE(s->sqe->opcode);
916 switch (opcode) {
917 case IORING_OP_NOP:
918 ret = io_nop(req, req->user_data);
919 break;
920 case IORING_OP_READV:
 921 ret = io_read(req, s, force_nonblock, state);
922 break;
923 case IORING_OP_WRITEV:
 924 ret = io_write(req, s, force_nonblock, state);
 925 break;
926 case IORING_OP_FSYNC:
927 ret = io_fsync(req, s->sqe, force_nonblock);
928 break;
929 default:
930 ret = -EINVAL;
931 break;
932 }
933
934 if (ret)
935 return ret;
936
937 if (ctx->flags & IORING_SETUP_IOPOLL) {
938 if (req->error == -EAGAIN)
939 return -EAGAIN;
940
941 /* workqueue context doesn't hold uring_lock, grab it now */
942 if (s->needs_lock)
943 mutex_lock(&ctx->uring_lock);
944 io_iopoll_req_issued(req);
945 if (s->needs_lock)
946 mutex_unlock(&ctx->uring_lock);
947 }
948
949 return 0;
950}
951
952static void io_sq_wq_submit_work(struct work_struct *work)
953{
954 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
955 struct sqe_submit *s = &req->submit;
956 const struct io_uring_sqe *sqe = s->sqe;
957 struct io_ring_ctx *ctx = req->ctx;
958 mm_segment_t old_fs = get_fs();
959 int ret;
960
961 /* Ensure we clear previously set forced non-block flag */
962 req->flags &= ~REQ_F_FORCE_NONBLOCK;
963 req->rw.ki_flags &= ~IOCB_NOWAIT;
964
965 if (!mmget_not_zero(ctx->sqo_mm)) {
966 ret = -EFAULT;
967 goto err;
968 }
969
970 use_mm(ctx->sqo_mm);
971 set_fs(USER_DS);
972 s->has_user = true;
 973 s->needs_lock = true;
 974
 975 do {
 976 ret = __io_submit_sqe(ctx, req, s, false, NULL);
977 /*
978 * We can get EAGAIN for polled IO even though we're forcing
979 * a sync submission from here, since we can't wait for
980 * request slots on the block side.
981 */
982 if (ret != -EAGAIN)
983 break;
984 cond_resched();
985 } while (1);
986
987 set_fs(old_fs);
988 unuse_mm(ctx->sqo_mm);
989 mmput(ctx->sqo_mm);
990err:
991 if (ret) {
992 io_cqring_add_event(ctx, sqe->user_data, ret, 0);
993 io_free_req(req);
994 }
995
996 /* async context always use a copy of the sqe */
997 kfree(sqe);
998}
999
1000static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
1001 struct io_submit_state *state)
1002{
1003 struct io_kiocb *req;
1004 ssize_t ret;
1005
1006 /* enforce forwards compatibility on users */
1007 if (unlikely(s->sqe->flags))
1008 return -EINVAL;
1009
1010 req = io_get_req(ctx);
1011 if (unlikely(!req))
1012 return -EAGAIN;
1013
1014 req->rw.ki_filp = NULL;
1015
 1016 ret = __io_submit_sqe(ctx, req, s, true, state);
1017 if (ret == -EAGAIN) {
1018 struct io_uring_sqe *sqe_copy;
1019
1020 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1021 if (sqe_copy) {
1022 memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1023 s->sqe = sqe_copy;
1024
1025 memcpy(&req->submit, s, sizeof(*s));
1026 INIT_WORK(&req->work, io_sq_wq_submit_work);
1027 queue_work(ctx->sqo_wq, &req->work);
1028 ret = 0;
1029 }
1030 }
1031 if (ret)
1032 io_free_req(req);
1033
1034 return ret;
1035}
1036
1037/*
1038 * Batched submission is done, ensure local IO is flushed out.
1039 */
1040static void io_submit_state_end(struct io_submit_state *state)
1041{
1042 blk_finish_plug(&state->plug);
1043 io_file_put(state, NULL);
1044}
1045
1046/*
1047 * Start submission side cache.
1048 */
1049static void io_submit_state_start(struct io_submit_state *state,
1050 struct io_ring_ctx *ctx, unsigned max_ios)
1051{
1052 blk_start_plug(&state->plug);
1053 state->file = NULL;
1054 state->ios_left = max_ios;
1055}
1056
1057static void io_commit_sqring(struct io_ring_ctx *ctx)
1058{
1059 struct io_sq_ring *ring = ctx->sq_ring;
1060
1061 if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
1062 /*
1063 * Ensure any loads from the SQEs are done at this point,
1064 * since once we write the new head, the application could
1065 * write new data to them.
1066 */
1067 smp_store_release(&ring->r.head, ctx->cached_sq_head);
1068
1069 /*
1070 * write side barrier of head update, app has read side. See
1071 * comment at the top of this file
1072 */
1073 smp_wmb();
1074 }
1075}
1076
1077/*
1078 * Undo last io_get_sqring()
1079 */
1080static void io_drop_sqring(struct io_ring_ctx *ctx)
1081{
1082 ctx->cached_sq_head--;
1083}
1084
1085/*
1086 * Fetch an sqe, if one is available. Note that s->sqe will point to memory
1087 * that is mapped by userspace. This means that care needs to be taken to
1088 * ensure that reads are stable, as we cannot rely on userspace always
1089 * being a good citizen. If members of the sqe are validated and then later
1090 * used, it's important that those reads are done through READ_ONCE() to
1091 * prevent a re-load down the line.
1092 */
1093static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
1094{
1095 struct io_sq_ring *ring = ctx->sq_ring;
1096 unsigned head;
1097
1098 /*
1099 * The cached sq head (or cq tail) serves two purposes:
1100 *
 1101 * 1) allows us to batch the cost of updating the user visible
 1102 * head.
1103 * 2) allows the kernel side to track the head on its own, even
1104 * though the application is the one updating it.
1105 */
1106 head = ctx->cached_sq_head;
1107 /* See comment at the top of this file */
1108 smp_rmb();
1109 if (head == READ_ONCE(ring->r.tail))
1110 return false;
1111
1112 head = READ_ONCE(ring->array[head & ctx->sq_mask]);
1113 if (head < ctx->sq_entries) {
1114 s->index = head;
1115 s->sqe = &ctx->sq_sqes[head];
1116 ctx->cached_sq_head++;
1117 return true;
1118 }
1119
1120 /* drop invalid entries */
1121 ctx->cached_sq_head++;
1122 ring->dropped++;
1123 /* See comment at the top of this file */
1124 smp_wmb();
1125 return false;
1126}
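/*
 * Illustrative example of the indirection above (values made up): with
 * sq_entries == 8, an application that filled sqes[5] publishes it by
 * storing 5 into array[tail & 7] before bumping the tail. io_get_sqring()
 * then reads that slot and points s->sqe at &ctx->sq_sqes[5], so sqes can
 * be filled and submitted in any order, independent of the ring position.
 */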
1127
1128static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
1129{
 1130 struct io_submit_state state, *statep = NULL;
 1131 int i, ret = 0, submit = 0;
 1132
1133 if (to_submit > IO_PLUG_THRESHOLD) {
1134 io_submit_state_start(&state, ctx, to_submit);
1135 statep = &state;
1136 }
1137
1138 for (i = 0; i < to_submit; i++) {
1139 struct sqe_submit s;
1140
1141 if (!io_get_sqring(ctx, &s))
1142 break;
1143
1144 s.has_user = true;
1145 s.needs_lock = false;
1146
 1147 ret = io_submit_sqe(ctx, &s, statep);
1148 if (ret) {
1149 io_drop_sqring(ctx);
1150 break;
1151 }
1152
1153 submit++;
1154 }
1155 io_commit_sqring(ctx);
1156
1157 if (statep)
1158 io_submit_state_end(statep);
1159
1160 return submit ? submit : ret;
1161}
1162
1163static unsigned io_cqring_events(struct io_cq_ring *ring)
1164{
1165 return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
1166}
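/*
 * head and tail are free-running u32 counters that are only masked when
 * indexing cqes[], so the unsigned subtraction above stays correct across
 * wrap-around: e.g. tail == 2 and head == 0xfffffffe yields 4 pending events.
 */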
1167
1168/*
1169 * Wait until events become available, if we don't already have some. The
1170 * application must reap them itself, as they reside on the shared cq ring.
1171 */
1172static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
1173 const sigset_t __user *sig, size_t sigsz)
1174{
1175 struct io_cq_ring *ring = ctx->cq_ring;
1176 sigset_t ksigmask, sigsaved;
1177 DEFINE_WAIT(wait);
1178 int ret;
1179
1180 /* See comment at the top of this file */
1181 smp_rmb();
1182 if (io_cqring_events(ring) >= min_events)
1183 return 0;
1184
1185 if (sig) {
1186 ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz);
1187 if (ret)
1188 return ret;
1189 }
1190
1191 do {
1192 prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE);
1193
1194 ret = 0;
1195 /* See comment at the top of this file */
1196 smp_rmb();
1197 if (io_cqring_events(ring) >= min_events)
1198 break;
1199
1200 schedule();
1201
1202 ret = -EINTR;
1203 if (signal_pending(current))
1204 break;
1205 } while (1);
1206
1207 finish_wait(&ctx->wait, &wait);
1208
1209 if (sig)
1210 restore_user_sigmask(sig, &sigsaved);
1211
1212 return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
1213}
1214
1215static int io_sq_offload_start(struct io_ring_ctx *ctx)
1216{
1217 int ret;
1218
1219 mmgrab(current->mm);
1220 ctx->sqo_mm = current->mm;
1221
1222 /* Do QD, or 2 * CPUS, whatever is smallest */
1223 ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
1224 min(ctx->sq_entries - 1, 2 * num_online_cpus()));
1225 if (!ctx->sqo_wq) {
1226 ret = -ENOMEM;
1227 goto err;
1228 }
1229
1230 return 0;
1231err:
1232 mmdrop(ctx->sqo_mm);
1233 ctx->sqo_mm = NULL;
1234 return ret;
1235}
1236
1237static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
1238{
1239 atomic_long_sub(nr_pages, &user->locked_vm);
1240}
1241
1242static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
1243{
1244 unsigned long page_limit, cur_pages, new_pages;
1245
1246 /* Don't allow more pages than we can safely lock */
1247 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1248
1249 do {
1250 cur_pages = atomic_long_read(&user->locked_vm);
1251 new_pages = cur_pages + nr_pages;
1252 if (new_pages > page_limit)
1253 return -ENOMEM;
1254 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
1255 new_pages) != cur_pages);
1256
1257 return 0;
1258}
1259
1260static void io_mem_free(void *ptr)
1261{
1262 struct page *page = virt_to_head_page(ptr);
1263
1264 if (put_page_testzero(page))
1265 free_compound_page(page);
1266}
1267
1268static void *io_mem_alloc(size_t size)
1269{
1270 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
1271 __GFP_NORETRY;
1272
1273 return (void *) __get_free_pages(gfp_flags, get_order(size));
1274}
1275
1276static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
1277{
1278 struct io_sq_ring *sq_ring;
1279 struct io_cq_ring *cq_ring;
1280 size_t bytes;
1281
1282 bytes = struct_size(sq_ring, array, sq_entries);
1283 bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
1284 bytes += struct_size(cq_ring, cqes, cq_entries);
1285
1286 return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1287}
1288
1289static void io_ring_ctx_free(struct io_ring_ctx *ctx)
1290{
1291 if (ctx->sqo_wq)
1292 destroy_workqueue(ctx->sqo_wq);
1293 if (ctx->sqo_mm)
1294 mmdrop(ctx->sqo_mm);
1295
1296 io_iopoll_reap_events(ctx);
1297
1298#if defined(CONFIG_UNIX)
1299 if (ctx->ring_sock)
1300 sock_release(ctx->ring_sock);
1301#endif
1302
1303 io_mem_free(ctx->sq_ring);
1304 io_mem_free(ctx->sq_sqes);
1305 io_mem_free(ctx->cq_ring);
1306
1307 percpu_ref_exit(&ctx->refs);
1308 if (ctx->account_mem)
1309 io_unaccount_mem(ctx->user,
1310 ring_pages(ctx->sq_entries, ctx->cq_entries));
1311 free_uid(ctx->user);
1312 kfree(ctx);
1313}
1314
1315static __poll_t io_uring_poll(struct file *file, poll_table *wait)
1316{
1317 struct io_ring_ctx *ctx = file->private_data;
1318 __poll_t mask = 0;
1319
1320 poll_wait(file, &ctx->cq_wait, wait);
1321 /* See comment at the top of this file */
1322 smp_rmb();
1323 if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head)
1324 mask |= EPOLLOUT | EPOLLWRNORM;
1325 if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
1326 mask |= EPOLLIN | EPOLLRDNORM;
1327
1328 return mask;
1329}
1330
1331static int io_uring_fasync(int fd, struct file *file, int on)
1332{
1333 struct io_ring_ctx *ctx = file->private_data;
1334
1335 return fasync_helper(fd, file, on, &ctx->cq_fasync);
1336}
1337
1338static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
1339{
1340 mutex_lock(&ctx->uring_lock);
1341 percpu_ref_kill(&ctx->refs);
1342 mutex_unlock(&ctx->uring_lock);
1343
 1344 io_iopoll_reap_events(ctx);
1345 wait_for_completion(&ctx->ctx_done);
1346 io_ring_ctx_free(ctx);
1347}
1348
1349static int io_uring_release(struct inode *inode, struct file *file)
1350{
1351 struct io_ring_ctx *ctx = file->private_data;
1352
1353 file->private_data = NULL;
1354 io_ring_ctx_wait_and_kill(ctx);
1355 return 0;
1356}
1357
1358static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
1359{
1360 loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
1361 unsigned long sz = vma->vm_end - vma->vm_start;
1362 struct io_ring_ctx *ctx = file->private_data;
1363 unsigned long pfn;
1364 struct page *page;
1365 void *ptr;
1366
1367 switch (offset) {
1368 case IORING_OFF_SQ_RING:
1369 ptr = ctx->sq_ring;
1370 break;
1371 case IORING_OFF_SQES:
1372 ptr = ctx->sq_sqes;
1373 break;
1374 case IORING_OFF_CQ_RING:
1375 ptr = ctx->cq_ring;
1376 break;
1377 default:
1378 return -EINVAL;
1379 }
1380
1381 page = virt_to_head_page(ptr);
1382 if (sz > (PAGE_SIZE << compound_order(page)))
1383 return -EINVAL;
1384
1385 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
1386 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1387}
1388
1389SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
1390 u32, min_complete, u32, flags, const sigset_t __user *, sig,
1391 size_t, sigsz)
1392{
1393 struct io_ring_ctx *ctx;
1394 long ret = -EBADF;
1395 int submitted = 0;
1396 struct fd f;
1397
1398 if (flags & ~IORING_ENTER_GETEVENTS)
1399 return -EINVAL;
1400
1401 f = fdget(fd);
1402 if (!f.file)
1403 return -EBADF;
1404
1405 ret = -EOPNOTSUPP;
1406 if (f.file->f_op != &io_uring_fops)
1407 goto out_fput;
1408
1409 ret = -ENXIO;
1410 ctx = f.file->private_data;
1411 if (!percpu_ref_tryget(&ctx->refs))
1412 goto out_fput;
1413
1414 ret = 0;
1415 if (to_submit) {
1416 to_submit = min(to_submit, ctx->sq_entries);
1417
1418 mutex_lock(&ctx->uring_lock);
1419 submitted = io_ring_submit(ctx, to_submit);
1420 mutex_unlock(&ctx->uring_lock);
1421
1422 if (submitted < 0)
1423 goto out_ctx;
1424 }
1425 if (flags & IORING_ENTER_GETEVENTS) {
1426 unsigned nr_events = 0;
1427
1428 min_complete = min(min_complete, ctx->cq_entries);
1429
1430 /*
1431 * The application could have included the 'to_submit' count
1432 * in how many events it wanted to wait for. If we failed to
1433 * submit the desired count, we may need to adjust the number
1434 * of events to poll/wait for.
1435 */
1436 if (submitted < to_submit)
1437 min_complete = min_t(unsigned, submitted, min_complete);
1438
1439 if (ctx->flags & IORING_SETUP_IOPOLL) {
1440 mutex_lock(&ctx->uring_lock);
1441 ret = io_iopoll_check(ctx, &nr_events, min_complete);
1442 mutex_unlock(&ctx->uring_lock);
1443 } else {
1444 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
1445 }
1446 }
1447
1448out_ctx:
1449 io_ring_drop_ctx_refs(ctx, 1);
1450out_fput:
1451 fdput(f);
1452 return submitted ? submitted : ret;
1453}
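/*
 * Typical usage from the application side (a sketch; the ring updates and
 * memory barriers described at the top of this file are omitted):
 *
 *	fill and publish N sqes in the mmap'ed SQ ring, then
 *	ret = io_uring_enter(ring_fd, N, 1, IORING_ENTER_GETEVENTS, NULL, 0);
 *
 * submits up to N entries and waits for at least one completion in a single
 * system call; the return value is the number of sqes actually submitted.
 */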
1454
1455static const struct file_operations io_uring_fops = {
1456 .release = io_uring_release,
1457 .mmap = io_uring_mmap,
1458 .poll = io_uring_poll,
1459 .fasync = io_uring_fasync,
1460};
1461
1462static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
1463 struct io_uring_params *p)
1464{
1465 struct io_sq_ring *sq_ring;
1466 struct io_cq_ring *cq_ring;
1467 size_t size;
1468
1469 sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
1470 if (!sq_ring)
1471 return -ENOMEM;
1472
1473 ctx->sq_ring = sq_ring;
1474 sq_ring->ring_mask = p->sq_entries - 1;
1475 sq_ring->ring_entries = p->sq_entries;
1476 ctx->sq_mask = sq_ring->ring_mask;
1477 ctx->sq_entries = sq_ring->ring_entries;
1478
1479 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
1480 if (size == SIZE_MAX)
1481 return -EOVERFLOW;
1482
1483 ctx->sq_sqes = io_mem_alloc(size);
1484 if (!ctx->sq_sqes) {
1485 io_mem_free(ctx->sq_ring);
1486 return -ENOMEM;
1487 }
1488
1489 cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
1490 if (!cq_ring) {
1491 io_mem_free(ctx->sq_ring);
1492 io_mem_free(ctx->sq_sqes);
1493 return -ENOMEM;
1494 }
1495
1496 ctx->cq_ring = cq_ring;
1497 cq_ring->ring_mask = p->cq_entries - 1;
1498 cq_ring->ring_entries = p->cq_entries;
1499 ctx->cq_mask = cq_ring->ring_mask;
1500 ctx->cq_entries = cq_ring->ring_entries;
1501 return 0;
1502}
1503
1504/*
1505 * Allocate an anonymous fd, this is what constitutes the application
1506 * visible backing of an io_uring instance. The application mmaps this
1507 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
1508 * we have to tie this fd to a socket for file garbage collection purposes.
1509 */
1510static int io_uring_get_fd(struct io_ring_ctx *ctx)
1511{
1512 struct file *file;
1513 int ret;
1514
1515#if defined(CONFIG_UNIX)
1516 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
1517 &ctx->ring_sock);
1518 if (ret)
1519 return ret;
1520#endif
1521
1522 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
1523 if (ret < 0)
1524 goto err;
1525
1526 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
1527 O_RDWR | O_CLOEXEC);
1528 if (IS_ERR(file)) {
1529 put_unused_fd(ret);
1530 ret = PTR_ERR(file);
1531 goto err;
1532 }
1533
1534#if defined(CONFIG_UNIX)
1535 ctx->ring_sock->file = file;
1536#endif
1537 fd_install(ret, file);
1538 return ret;
1539err:
1540#if defined(CONFIG_UNIX)
1541 sock_release(ctx->ring_sock);
1542 ctx->ring_sock = NULL;
1543#endif
1544 return ret;
1545}
1546
1547static int io_uring_create(unsigned entries, struct io_uring_params *p)
1548{
1549 struct user_struct *user = NULL;
1550 struct io_ring_ctx *ctx;
1551 bool account_mem;
1552 int ret;
1553
1554 if (!entries || entries > IORING_MAX_ENTRIES)
1555 return -EINVAL;
1556
1557 /*
1558 * Use twice as many entries for the CQ ring. It's possible for the
1559 * application to drive a higher depth than the size of the SQ ring,
1560 * since the sqes are only used at submission time. This allows for
1561 * some flexibility in overcommitting a bit.
1562 */
1563 p->sq_entries = roundup_pow_of_two(entries);
1564 p->cq_entries = 2 * p->sq_entries;
1565
1566 user = get_uid(current_user());
1567 account_mem = !capable(CAP_IPC_LOCK);
1568
1569 if (account_mem) {
1570 ret = io_account_mem(user,
1571 ring_pages(p->sq_entries, p->cq_entries));
1572 if (ret) {
1573 free_uid(user);
1574 return ret;
1575 }
1576 }
1577
1578 ctx = io_ring_ctx_alloc(p);
1579 if (!ctx) {
1580 if (account_mem)
1581 io_unaccount_mem(user, ring_pages(p->sq_entries,
1582 p->cq_entries));
1583 free_uid(user);
1584 return -ENOMEM;
1585 }
1586 ctx->compat = in_compat_syscall();
1587 ctx->account_mem = account_mem;
1588 ctx->user = user;
1589
1590 ret = io_allocate_scq_urings(ctx, p);
1591 if (ret)
1592 goto err;
1593
1594 ret = io_sq_offload_start(ctx);
1595 if (ret)
1596 goto err;
1597
1598 ret = io_uring_get_fd(ctx);
1599 if (ret < 0)
1600 goto err;
1601
1602 memset(&p->sq_off, 0, sizeof(p->sq_off));
1603 p->sq_off.head = offsetof(struct io_sq_ring, r.head);
1604 p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
1605 p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
1606 p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
1607 p->sq_off.flags = offsetof(struct io_sq_ring, flags);
1608 p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
1609 p->sq_off.array = offsetof(struct io_sq_ring, array);
1610
1611 memset(&p->cq_off, 0, sizeof(p->cq_off));
1612 p->cq_off.head = offsetof(struct io_cq_ring, r.head);
1613 p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
1614 p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
1615 p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
1616 p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
1617 p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
1618 return ret;
1619err:
1620 io_ring_ctx_wait_and_kill(ctx);
1621 return ret;
1622}
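/*
 * A sketch of how userspace is expected to consume the offsets filled in
 * above (see the liburing examples referenced at the top of this file;
 * names and sizes here are placeholders):
 *
 *	ptr = mmap(NULL, sq_ring_size, PROT_READ | PROT_WRITE,
 *		   MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
 *	sq_head  = ptr + p.sq_off.head;
 *	sq_tail  = ptr + p.sq_off.tail;
 *	sq_array = ptr + p.sq_off.array;
 *
 * and likewise for the CQ ring via IORING_OFF_CQ_RING and the sqe array via
 * IORING_OFF_SQES.
 */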
1623
1624/*
 1625 * Sets up an aio uring context, and returns the fd. The application asks for a
1626 * ring size, we return the actual sq/cq ring sizes (among other things) in the
1627 * params structure passed in.
1628 */
1629static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
1630{
1631 struct io_uring_params p;
1632 long ret;
1633 int i;
1634
1635 if (copy_from_user(&p, params, sizeof(p)))
1636 return -EFAULT;
1637 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
1638 if (p.resv[i])
1639 return -EINVAL;
1640 }
1641
 1642 if (p.flags & ~IORING_SETUP_IOPOLL)
1643 return -EINVAL;
1644
1645 ret = io_uring_create(entries, &p);
1646 if (ret < 0)
1647 return ret;
1648
1649 if (copy_to_user(params, &p, sizeof(p)))
1650 return -EFAULT;
1651
1652 return ret;
1653}
1654
1655SYSCALL_DEFINE2(io_uring_setup, u32, entries,
1656 struct io_uring_params __user *, params)
1657{
1658 return io_uring_setup(entries, params);
1659}
1660
1661static int __init io_uring_init(void)
1662{
1663 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
1664 return 0;
1665};
1666__initcall(io_uring_init);