io_uring: Fix unused function warnings
[linux-block.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqring (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
52de1fe1 47#include <net/compat.h>
2b188cc1
JA
48#include <linux/refcount.h>
49#include <linux/uio.h>
6b47ee6e 50#include <linux/bits.h>
2b188cc1
JA
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
58#include <linux/mmu_context.h>
59#include <linux/percpu.h>
60#include <linux/slab.h>
6c271ce2 61#include <linux/kthread.h>
2b188cc1 62#include <linux/blkdev.h>
edafccee 63#include <linux/bvec.h>
2b188cc1
JA
64#include <linux/net.h>
65#include <net/sock.h>
66#include <net/af_unix.h>
6b06314c 67#include <net/scm.h>
2b188cc1
JA
68#include <linux/anon_inodes.h>
69#include <linux/sched/mm.h>
70#include <linux/uaccess.h>
71#include <linux/nospec.h>
edafccee
JA
72#include <linux/sizes.h>
73#include <linux/hugetlb.h>
aa4c3967 74#include <linux/highmem.h>
15b71abe
JA
75#include <linux/namei.h>
76#include <linux/fsnotify.h>
4840e418 77#include <linux/fadvise.h>
3e4827b0 78#include <linux/eventpoll.h>
ff002b30 79#include <linux/fs_struct.h>
7d67af2c 80#include <linux/splice.h>
b41e9852 81#include <linux/task_work.h>
2b188cc1 82
c826bd7a
DD
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
2b188cc1
JA
86#include <uapi/linux/io_uring.h>
87
88#include "internal.h"
561fb04a 89#include "io-wq.h"
2b188cc1 90
5277deaa 91#define IORING_MAX_ENTRIES 32768
33a107f0 92#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
65e19f54
JA
93
94/*
95 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
96 */
97#define IORING_FILE_TABLE_SHIFT 9
98#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
99#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
100#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
2b188cc1
JA
101
102struct io_uring {
103 u32 head ____cacheline_aligned_in_smp;
104 u32 tail ____cacheline_aligned_in_smp;
105};
106
1e84b97b 107/*
75b28aff
HV
108 * This data is shared with the application through the mmap at offsets
109 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
110 *
111 * The offsets to the member fields are published through struct
112 * io_sqring_offsets when calling io_uring_setup.
113 */
75b28aff 114struct io_rings {
1e84b97b
SB
115 /*
116 * Head and tail offsets into the ring; the offsets need to be
117 * masked to get valid indices.
118 *
75b28aff
HV
119 * The kernel controls head of the sq ring and the tail of the cq ring,
120 * and the application controls tail of the sq ring and the head of the
121 * cq ring.
1e84b97b 122 */
75b28aff 123 struct io_uring sq, cq;
1e84b97b 124 /*
75b28aff 125 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
126 * ring_entries - 1)
127 */
75b28aff
HV
128 u32 sq_ring_mask, cq_ring_mask;
129 /* Ring sizes (constant, power of 2) */
130 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
131 /*
132 * Number of invalid entries dropped by the kernel due to
133 * invalid index stored in array
134 *
135 * Written by the kernel, shouldn't be modified by the
136 * application (i.e. get number of "new events" by comparing to
137 * cached value).
138 *
139 * After a new SQ head value was read by the application this
140 * counter includes all submissions that were dropped reaching
141 * the new SQ head (and possibly more).
142 */
75b28aff 143 u32 sq_dropped;
1e84b97b
SB
144 /*
145 * Runtime flags
146 *
147 * Written by the kernel, shouldn't be modified by the
148 * application.
149 *
150 * The application needs a full memory barrier before checking
151 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
152 */
75b28aff 153 u32 sq_flags;
1e84b97b
SB
154 /*
155 * Number of completion events lost because the queue was full;
156 * this should be avoided by the application by making sure
0b4295b5 157 * there are not more requests pending than there is space in
1e84b97b
SB
158 * the completion queue.
159 *
160 * Written by the kernel, shouldn't be modified by the
161 * application (i.e. get number of "new events" by comparing to
162 * cached value).
163 *
164 * As completion events come in out of order this counter is not
165 * ordered with any other data.
166 */
75b28aff 167 u32 cq_overflow;
1e84b97b
SB
168 /*
169 * Ring buffer of completion events.
170 *
171 * The kernel writes completion events fresh every time they are
172 * produced, so the application is allowed to modify pending
173 * entries.
174 */
75b28aff 175 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
176};
177
edafccee
JA
178struct io_mapped_ubuf {
179 u64 ubuf;
180 size_t len;
181 struct bio_vec *bvec;
182 unsigned int nr_bvecs;
183};
184
65e19f54
JA
185struct fixed_file_table {
186 struct file **files;
31b51510
JA
187};
188
05f3fb3c
JA
189struct fixed_file_data {
190 struct fixed_file_table *table;
191 struct io_ring_ctx *ctx;
192
193 struct percpu_ref refs;
194 struct llist_head put_llist;
05f3fb3c
JA
195 struct work_struct ref_work;
196 struct completion done;
197};
198
5a2e745d
JA
199struct io_buffer {
200 struct list_head list;
201 __u64 addr;
202 __s32 len;
203 __u16 bid;
204};
205
2b188cc1
JA
206struct io_ring_ctx {
207 struct {
208 struct percpu_ref refs;
209 } ____cacheline_aligned_in_smp;
210
211 struct {
212 unsigned int flags;
e1d85334
RD
213 unsigned int compat: 1;
214 unsigned int account_mem: 1;
215 unsigned int cq_overflow_flushed: 1;
216 unsigned int drain_next: 1;
217 unsigned int eventfd_async: 1;
2b188cc1 218
75b28aff
HV
219 /*
220 * Ring buffer of indices into array of io_uring_sqe, which is
221 * mmapped by the application using the IORING_OFF_SQES offset.
222 *
223 * This indirection could e.g. be used to assign fixed
224 * io_uring_sqe entries to operations and only submit them to
225 * the queue when needed.
226 *
227 * The kernel modifies neither the indices array nor the entries
228 * array.
229 */
230 u32 *sq_array;
2b188cc1
JA
231 unsigned cached_sq_head;
232 unsigned sq_entries;
233 unsigned sq_mask;
6c271ce2 234 unsigned sq_thread_idle;
498ccd9e 235 unsigned cached_sq_dropped;
206aefde 236 atomic_t cached_cq_overflow;
ad3eb2c8 237 unsigned long sq_check_overflow;
de0617e4
JA
238
239 struct list_head defer_list;
5262f567 240 struct list_head timeout_list;
1d7bb1d5 241 struct list_head cq_overflow_list;
fcb323cc
JA
242
243 wait_queue_head_t inflight_wait;
ad3eb2c8 244 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
245 } ____cacheline_aligned_in_smp;
246
206aefde
JA
247 struct io_rings *rings;
248
2b188cc1 249 /* IO offload */
561fb04a 250 struct io_wq *io_wq;
6c271ce2 251 struct task_struct *sqo_thread; /* if using sq thread polling */
2b188cc1 252 struct mm_struct *sqo_mm;
6c271ce2 253 wait_queue_head_t sqo_wait;
75b28aff 254
6b06314c
JA
255 /*
256 * If used, fixed file set. Writers must ensure that ->refs is dead,
257 * readers must ensure that ->refs is alive as long as the file* is
258 * used. Only updated through io_uring_register(2).
259 */
05f3fb3c 260 struct fixed_file_data *file_data;
6b06314c 261 unsigned nr_user_files;
b14cca0c
PB
262 int ring_fd;
263 struct file *ring_file;
6b06314c 264
edafccee
JA
265 /* if used, fixed mapped user buffers */
266 unsigned nr_user_bufs;
267 struct io_mapped_ubuf *user_bufs;
268
2b188cc1
JA
269 struct user_struct *user;
270
0b8c0ec7 271 const struct cred *creds;
181e448d 272
206aefde
JA
273 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
274 struct completion *completions;
275
0ddf92e8
JA
276 /* if all else fails... */
277 struct io_kiocb *fallback_req;
278
206aefde
JA
279#if defined(CONFIG_UNIX)
280 struct socket *ring_sock;
281#endif
282
5a2e745d
JA
283 struct idr io_buffer_idr;
284
071698e1
JA
285 struct idr personality_idr;
286
206aefde
JA
287 struct {
288 unsigned cached_cq_tail;
289 unsigned cq_entries;
290 unsigned cq_mask;
291 atomic_t cq_timeouts;
ad3eb2c8 292 unsigned long cq_check_overflow;
206aefde
JA
293 struct wait_queue_head cq_wait;
294 struct fasync_struct *cq_fasync;
295 struct eventfd_ctx *cq_ev_fd;
296 } ____cacheline_aligned_in_smp;
2b188cc1
JA
297
298 struct {
299 struct mutex uring_lock;
300 wait_queue_head_t wait;
301 } ____cacheline_aligned_in_smp;
302
303 struct {
304 spinlock_t completion_lock;
e94f141b 305
def596e9
JA
306 /*
307 * ->poll_list is protected by the ctx->uring_lock for
308 * io_uring instances that don't use IORING_SETUP_SQPOLL.
309 * For SQPOLL, only the single threaded io_sq_thread() will
310 * manipulate the list, hence no extra locking is needed there.
311 */
312 struct list_head poll_list;
78076bb6
JA
313 struct hlist_head *cancel_hash;
314 unsigned cancel_hash_bits;
e94f141b 315 bool poll_multi_file;
31b51510 316
fcb323cc
JA
317 spinlock_t inflight_lock;
318 struct list_head inflight_list;
2b188cc1 319 } ____cacheline_aligned_in_smp;
2b188cc1
JA
320};
321
09bb8394
JA
322/*
323 * First field must be the file pointer in all the
324 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
325 */
221c5eb2
JA
326struct io_poll_iocb {
327 struct file *file;
0969e783
JA
328 union {
329 struct wait_queue_head *head;
330 u64 addr;
331 };
221c5eb2 332 __poll_t events;
8c838788 333 bool done;
221c5eb2 334 bool canceled;
392edb45 335 struct wait_queue_entry wait;
221c5eb2
JA
336};
337
b5dba59e
JA
338struct io_close {
339 struct file *file;
340 struct file *put_file;
341 int fd;
342};
343
ad8a48ac
JA
344struct io_timeout_data {
345 struct io_kiocb *req;
346 struct hrtimer timer;
347 struct timespec64 ts;
348 enum hrtimer_mode mode;
cc42e0ac 349 u32 seq_offset;
ad8a48ac
JA
350};
351
8ed8d3c3
JA
352struct io_accept {
353 struct file *file;
354 struct sockaddr __user *addr;
355 int __user *addr_len;
356 int flags;
357};
358
359struct io_sync {
360 struct file *file;
361 loff_t len;
362 loff_t off;
363 int flags;
d63d1b5e 364 int mode;
8ed8d3c3
JA
365};
366
fbf23849
JA
367struct io_cancel {
368 struct file *file;
369 u64 addr;
370};
371
b29472ee
JA
372struct io_timeout {
373 struct file *file;
374 u64 addr;
375 int flags;
26a61679 376 unsigned count;
b29472ee
JA
377};
378
9adbd45d
JA
379struct io_rw {
380 /* NOTE: kiocb has the file as the first member, so don't do it here */
381 struct kiocb kiocb;
382 u64 addr;
383 u64 len;
384};
385
3fbb51c1
JA
386struct io_connect {
387 struct file *file;
388 struct sockaddr __user *addr;
389 int addr_len;
390};
391
e47293fd
JA
392struct io_sr_msg {
393 struct file *file;
fddaface
JA
394 union {
395 struct user_msghdr __user *msg;
396 void __user *buf;
397 };
e47293fd 398 int msg_flags;
bcda7baa 399 int bgid;
fddaface 400 size_t len;
bcda7baa 401 struct io_buffer *kbuf;
e47293fd
JA
402};
403
15b71abe
JA
404struct io_open {
405 struct file *file;
406 int dfd;
eddc7ef5 407 union {
eddc7ef5
JA
408 unsigned mask;
409 };
15b71abe 410 struct filename *filename;
eddc7ef5 411 struct statx __user *buffer;
c12cedf2 412 struct open_how how;
15b71abe
JA
413};
414
05f3fb3c
JA
415struct io_files_update {
416 struct file *file;
417 u64 arg;
418 u32 nr_args;
419 u32 offset;
420};
421
4840e418
JA
422struct io_fadvise {
423 struct file *file;
424 u64 offset;
425 u32 len;
426 u32 advice;
427};
428
c1ca757b
JA
429struct io_madvise {
430 struct file *file;
431 u64 addr;
432 u32 len;
433 u32 advice;
434};
435
3e4827b0
JA
436struct io_epoll {
437 struct file *file;
438 int epfd;
439 int op;
440 int fd;
441 struct epoll_event event;
e47293fd
JA
442};
443
7d67af2c
PB
444struct io_splice {
445 struct file *file_out;
446 struct file *file_in;
447 loff_t off_out;
448 loff_t off_in;
449 u64 len;
450 unsigned int flags;
451};
452
ddf0322d
JA
453struct io_provide_buf {
454 struct file *file;
455 __u64 addr;
456 __s32 len;
457 __u32 bgid;
458 __u16 nbufs;
459 __u16 bid;
460};
461
f499a021
JA
462struct io_async_connect {
463 struct sockaddr_storage address;
464};
465
03b1230c
JA
466struct io_async_msghdr {
467 struct iovec fast_iov[UIO_FASTIOV];
468 struct iovec *iov;
469 struct sockaddr __user *uaddr;
470 struct msghdr msg;
b537916c 471 struct sockaddr_storage addr;
03b1230c
JA
472};
473
f67676d1
JA
474struct io_async_rw {
475 struct iovec fast_iov[UIO_FASTIOV];
476 struct iovec *iov;
477 ssize_t nr_segs;
478 ssize_t size;
479};
480
1a6b74fc 481struct io_async_ctx {
f67676d1
JA
482 union {
483 struct io_async_rw rw;
03b1230c 484 struct io_async_msghdr msg;
f499a021 485 struct io_async_connect connect;
2d28390a 486 struct io_timeout_data timeout;
f67676d1 487 };
1a6b74fc
JA
488};
489
6b47ee6e
PB
490enum {
491 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
492 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
493 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
494 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
495 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 496 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
6b47ee6e
PB
497
498 REQ_F_LINK_NEXT_BIT,
499 REQ_F_FAIL_LINK_BIT,
500 REQ_F_INFLIGHT_BIT,
501 REQ_F_CUR_POS_BIT,
502 REQ_F_NOWAIT_BIT,
503 REQ_F_IOPOLL_COMPLETED_BIT,
504 REQ_F_LINK_TIMEOUT_BIT,
505 REQ_F_TIMEOUT_BIT,
506 REQ_F_ISREG_BIT,
507 REQ_F_MUST_PUNT_BIT,
508 REQ_F_TIMEOUT_NOSEQ_BIT,
509 REQ_F_COMP_LOCKED_BIT,
99bc4c38 510 REQ_F_NEED_CLEANUP_BIT,
2ca10259 511 REQ_F_OVERFLOW_BIT,
d7718a9d 512 REQ_F_POLLED_BIT,
bcda7baa 513 REQ_F_BUFFER_SELECTED_BIT,
84557871
JA
514
515 /* not a real bit, just to check we're not overflowing the space */
516 __REQ_F_LAST_BIT,
6b47ee6e
PB
517};
518
519enum {
520 /* ctx owns file */
521 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
522 /* drain existing IO first */
523 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
524 /* linked sqes */
525 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
526 /* doesn't sever on completion < 0 */
527 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
528 /* IOSQE_ASYNC */
529 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
530 /* IOSQE_BUFFER_SELECT */
531 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
6b47ee6e
PB
532
533 /* already grabbed next link */
534 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
535 /* fail rest of links */
536 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
537 /* on inflight list */
538 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
539 /* read/write uses file position */
540 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
541 /* must not punt to workers */
542 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
543 /* polled IO has completed */
544 REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
545 /* has linked timeout */
546 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
547 /* timeout request */
548 REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
549 /* regular file */
550 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
551 /* must be punted even for NONBLOCK */
552 REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
553 /* no timeout sequence */
554 REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
555 /* completion under lock */
556 REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
99bc4c38
PB
557 /* needs cleanup */
558 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
2ca10259
JA
559 /* in overflow list */
560 REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
d7718a9d
JA
561 /* already went through poll handler */
562 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
563 /* buffer already selected */
564 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
d7718a9d
JA
565};
566
567struct async_poll {
568 struct io_poll_iocb poll;
569 struct io_wq_work work;
6b47ee6e
PB
570};
571
09bb8394
JA
572/*
573 * NOTE! Each of the iocb union members has the file pointer
574 * as the first entry in their struct definition. So you can
575 * access the file pointer through any of the sub-structs,
576 * or directly as just 'ki_filp' in this struct.
577 */
2b188cc1 578struct io_kiocb {
221c5eb2 579 union {
09bb8394 580 struct file *file;
9adbd45d 581 struct io_rw rw;
221c5eb2 582 struct io_poll_iocb poll;
8ed8d3c3
JA
583 struct io_accept accept;
584 struct io_sync sync;
fbf23849 585 struct io_cancel cancel;
b29472ee 586 struct io_timeout timeout;
3fbb51c1 587 struct io_connect connect;
e47293fd 588 struct io_sr_msg sr_msg;
15b71abe 589 struct io_open open;
b5dba59e 590 struct io_close close;
05f3fb3c 591 struct io_files_update files_update;
4840e418 592 struct io_fadvise fadvise;
c1ca757b 593 struct io_madvise madvise;
3e4827b0 594 struct io_epoll epoll;
7d67af2c 595 struct io_splice splice;
ddf0322d 596 struct io_provide_buf pbuf;
221c5eb2 597 };
2b188cc1 598
1a6b74fc 599 struct io_async_ctx *io;
cf6fd4bd 600 bool needs_fixed_file;
d625c6ee 601 u8 opcode;
2b188cc1
JA
602
603 struct io_ring_ctx *ctx;
d7718a9d 604 struct list_head list;
2b188cc1 605 unsigned int flags;
c16361c1 606 refcount_t refs;
d7718a9d 607 struct task_struct *task;
2b188cc1 608 u64 user_data;
9e645e11 609 u32 result;
de0617e4 610 u32 sequence;
2b188cc1 611
d7718a9d
JA
612 struct list_head link_list;
613
fcb323cc
JA
614 struct list_head inflight_entry;
615
b41e9852
JA
616 union {
617 /*
618 * Only commands that never go async can use the below fields,
d7718a9d
JA
619 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
620 * async armed poll handlers for regular commands. The latter
621 * restore the work, if needed.
b41e9852
JA
622 */
623 struct {
b41e9852 624 struct callback_head task_work;
d7718a9d
JA
625 struct hlist_node hash_node;
626 struct async_poll *apoll;
bcda7baa 627 int cflags;
b41e9852
JA
628 };
629 struct io_wq_work work;
630 };
2b188cc1
JA
631};
632
633#define IO_PLUG_THRESHOLD 2
def596e9 634#define IO_IOPOLL_BATCH 8
2b188cc1 635
9a56a232
JA
636struct io_submit_state {
637 struct blk_plug plug;
638
2579f913
JA
639 /*
640 * io_kiocb alloc cache
641 */
642 void *reqs[IO_IOPOLL_BATCH];
6c8a3134 643 unsigned int free_reqs;
2579f913 644
9a56a232
JA
645 /*
646 * File reference cache
647 */
648 struct file *file;
649 unsigned int fd;
650 unsigned int has_refs;
651 unsigned int used_refs;
652 unsigned int ios_left;
653};
654
d3656344
JA
655struct io_op_def {
656 /* needs req->io allocated for deferral/async */
657 unsigned async_ctx : 1;
658 /* needs current->mm setup, does mm access */
659 unsigned needs_mm : 1;
660 /* needs req->file assigned */
661 unsigned needs_file : 1;
662 /* needs req->file assigned IFF fd is >= 0 */
663 unsigned fd_non_neg : 1;
664 /* hash wq insertion if file is a regular file */
665 unsigned hash_reg_file : 1;
666 /* unbound wq insertion if file is a non-regular file */
667 unsigned unbound_nonreg_file : 1;
66f4af93
JA
668 /* opcode is not supported by this kernel */
669 unsigned not_supported : 1;
f86cd20c
JA
670 /* needs file table */
671 unsigned file_table : 1;
ff002b30
JA
672 /* needs ->fs */
673 unsigned needs_fs : 1;
8a72758c
JA
674 /* set if opcode supports polled "wait" */
675 unsigned pollin : 1;
676 unsigned pollout : 1;
bcda7baa
JA
677 /* op supports buffer selection */
678 unsigned buffer_select : 1;
d3656344
JA
679};
680
681static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
682 [IORING_OP_NOP] = {},
683 [IORING_OP_READV] = {
d3656344
JA
684 .async_ctx = 1,
685 .needs_mm = 1,
686 .needs_file = 1,
687 .unbound_nonreg_file = 1,
8a72758c 688 .pollin = 1,
4d954c25 689 .buffer_select = 1,
d3656344 690 },
0463b6c5 691 [IORING_OP_WRITEV] = {
d3656344
JA
692 .async_ctx = 1,
693 .needs_mm = 1,
694 .needs_file = 1,
695 .hash_reg_file = 1,
696 .unbound_nonreg_file = 1,
8a72758c 697 .pollout = 1,
d3656344 698 },
0463b6c5 699 [IORING_OP_FSYNC] = {
d3656344
JA
700 .needs_file = 1,
701 },
0463b6c5 702 [IORING_OP_READ_FIXED] = {
d3656344
JA
703 .needs_file = 1,
704 .unbound_nonreg_file = 1,
8a72758c 705 .pollin = 1,
d3656344 706 },
0463b6c5 707 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
708 .needs_file = 1,
709 .hash_reg_file = 1,
710 .unbound_nonreg_file = 1,
8a72758c 711 .pollout = 1,
d3656344 712 },
0463b6c5 713 [IORING_OP_POLL_ADD] = {
d3656344
JA
714 .needs_file = 1,
715 .unbound_nonreg_file = 1,
716 },
0463b6c5
PB
717 [IORING_OP_POLL_REMOVE] = {},
718 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344
JA
719 .needs_file = 1,
720 },
0463b6c5 721 [IORING_OP_SENDMSG] = {
d3656344
JA
722 .async_ctx = 1,
723 .needs_mm = 1,
724 .needs_file = 1,
725 .unbound_nonreg_file = 1,
ff002b30 726 .needs_fs = 1,
8a72758c 727 .pollout = 1,
d3656344 728 },
0463b6c5 729 [IORING_OP_RECVMSG] = {
d3656344
JA
730 .async_ctx = 1,
731 .needs_mm = 1,
732 .needs_file = 1,
733 .unbound_nonreg_file = 1,
ff002b30 734 .needs_fs = 1,
8a72758c 735 .pollin = 1,
52de1fe1 736 .buffer_select = 1,
d3656344 737 },
0463b6c5 738 [IORING_OP_TIMEOUT] = {
d3656344
JA
739 .async_ctx = 1,
740 .needs_mm = 1,
741 },
0463b6c5
PB
742 [IORING_OP_TIMEOUT_REMOVE] = {},
743 [IORING_OP_ACCEPT] = {
d3656344
JA
744 .needs_mm = 1,
745 .needs_file = 1,
746 .unbound_nonreg_file = 1,
f86cd20c 747 .file_table = 1,
8a72758c 748 .pollin = 1,
d3656344 749 },
0463b6c5
PB
750 [IORING_OP_ASYNC_CANCEL] = {},
751 [IORING_OP_LINK_TIMEOUT] = {
d3656344
JA
752 .async_ctx = 1,
753 .needs_mm = 1,
754 },
0463b6c5 755 [IORING_OP_CONNECT] = {
d3656344
JA
756 .async_ctx = 1,
757 .needs_mm = 1,
758 .needs_file = 1,
759 .unbound_nonreg_file = 1,
8a72758c 760 .pollout = 1,
d3656344 761 },
0463b6c5 762 [IORING_OP_FALLOCATE] = {
d3656344
JA
763 .needs_file = 1,
764 },
0463b6c5 765 [IORING_OP_OPENAT] = {
d3656344
JA
766 .needs_file = 1,
767 .fd_non_neg = 1,
f86cd20c 768 .file_table = 1,
ff002b30 769 .needs_fs = 1,
d3656344 770 },
0463b6c5 771 [IORING_OP_CLOSE] = {
d3656344 772 .needs_file = 1,
f86cd20c 773 .file_table = 1,
d3656344 774 },
0463b6c5 775 [IORING_OP_FILES_UPDATE] = {
d3656344 776 .needs_mm = 1,
f86cd20c 777 .file_table = 1,
d3656344 778 },
0463b6c5 779 [IORING_OP_STATX] = {
d3656344
JA
780 .needs_mm = 1,
781 .needs_file = 1,
782 .fd_non_neg = 1,
ff002b30 783 .needs_fs = 1,
d3656344 784 },
0463b6c5 785 [IORING_OP_READ] = {
3a6820f2
JA
786 .needs_mm = 1,
787 .needs_file = 1,
788 .unbound_nonreg_file = 1,
8a72758c 789 .pollin = 1,
bcda7baa 790 .buffer_select = 1,
3a6820f2 791 },
0463b6c5 792 [IORING_OP_WRITE] = {
3a6820f2
JA
793 .needs_mm = 1,
794 .needs_file = 1,
795 .unbound_nonreg_file = 1,
8a72758c 796 .pollout = 1,
3a6820f2 797 },
0463b6c5 798 [IORING_OP_FADVISE] = {
4840e418
JA
799 .needs_file = 1,
800 },
0463b6c5 801 [IORING_OP_MADVISE] = {
c1ca757b
JA
802 .needs_mm = 1,
803 },
0463b6c5 804 [IORING_OP_SEND] = {
fddaface
JA
805 .needs_mm = 1,
806 .needs_file = 1,
807 .unbound_nonreg_file = 1,
8a72758c 808 .pollout = 1,
fddaface 809 },
0463b6c5 810 [IORING_OP_RECV] = {
fddaface
JA
811 .needs_mm = 1,
812 .needs_file = 1,
813 .unbound_nonreg_file = 1,
8a72758c 814 .pollin = 1,
bcda7baa 815 .buffer_select = 1,
fddaface 816 },
0463b6c5 817 [IORING_OP_OPENAT2] = {
cebdb986
JA
818 .needs_file = 1,
819 .fd_non_neg = 1,
f86cd20c 820 .file_table = 1,
ff002b30 821 .needs_fs = 1,
cebdb986 822 },
3e4827b0
JA
823 [IORING_OP_EPOLL_CTL] = {
824 .unbound_nonreg_file = 1,
825 .file_table = 1,
826 },
7d67af2c
PB
827 [IORING_OP_SPLICE] = {
828 .needs_file = 1,
829 .hash_reg_file = 1,
830 .unbound_nonreg_file = 1,
ddf0322d
JA
831 },
832 [IORING_OP_PROVIDE_BUFFERS] = {},
067524e9 833 [IORING_OP_REMOVE_BUFFERS] = {},
d3656344
JA
834};
835
561fb04a 836static void io_wq_submit_work(struct io_wq_work **workptr);
78e19bbe 837static void io_cqring_fill_event(struct io_kiocb *req, long res);
ec9c02ad 838static void io_put_req(struct io_kiocb *req);
978db57e 839static void __io_double_put_req(struct io_kiocb *req);
94ae5e77
JA
840static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
841static void io_queue_linked_timeout(struct io_kiocb *req);
05f3fb3c
JA
842static int __io_sqe_files_update(struct io_ring_ctx *ctx,
843 struct io_uring_files_update *ip,
844 unsigned nr_args);
f86cd20c 845static int io_grab_files(struct io_kiocb *req);
2faf852d 846static void io_ring_file_ref_flush(struct fixed_file_data *data);
99bc4c38 847static void io_cleanup_req(struct io_kiocb *req);
b41e9852
JA
848static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
849 int fd, struct file **out_file, bool fixed);
850static void __io_queue_sqe(struct io_kiocb *req,
851 const struct io_uring_sqe *sqe);
de0617e4 852
2b188cc1
JA
853static struct kmem_cache *req_cachep;
854
855static const struct file_operations io_uring_fops;
856
857struct sock *io_uring_get_socket(struct file *file)
858{
859#if defined(CONFIG_UNIX)
860 if (file->f_op == &io_uring_fops) {
861 struct io_ring_ctx *ctx = file->private_data;
862
863 return ctx->ring_sock->sk;
864 }
865#endif
866 return NULL;
867}
868EXPORT_SYMBOL(io_uring_get_socket);
869
870static void io_ring_ctx_ref_free(struct percpu_ref *ref)
871{
872 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
873
206aefde 874 complete(&ctx->completions[0]);
2b188cc1
JA
875}
876
877static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
878{
879 struct io_ring_ctx *ctx;
78076bb6 880 int hash_bits;
2b188cc1
JA
881
882 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
883 if (!ctx)
884 return NULL;
885
0ddf92e8
JA
886 ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
887 if (!ctx->fallback_req)
888 goto err;
889
206aefde
JA
890 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
891 if (!ctx->completions)
892 goto err;
893
78076bb6
JA
894 /*
895 * Use 5 bits less than the max cq entries, that should give us around
896 * 32 entries per hash list if totally full and uniformly spread.
897 */
898 hash_bits = ilog2(p->cq_entries);
899 hash_bits -= 5;
900 if (hash_bits <= 0)
901 hash_bits = 1;
902 ctx->cancel_hash_bits = hash_bits;
903 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
904 GFP_KERNEL);
905 if (!ctx->cancel_hash)
906 goto err;
907 __hash_init(ctx->cancel_hash, 1U << hash_bits);
908
21482896 909 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
910 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
911 goto err;
2b188cc1
JA
912
913 ctx->flags = p->flags;
914 init_waitqueue_head(&ctx->cq_wait);
1d7bb1d5 915 INIT_LIST_HEAD(&ctx->cq_overflow_list);
206aefde
JA
916 init_completion(&ctx->completions[0]);
917 init_completion(&ctx->completions[1]);
5a2e745d 918 idr_init(&ctx->io_buffer_idr);
071698e1 919 idr_init(&ctx->personality_idr);
2b188cc1
JA
920 mutex_init(&ctx->uring_lock);
921 init_waitqueue_head(&ctx->wait);
922 spin_lock_init(&ctx->completion_lock);
def596e9 923 INIT_LIST_HEAD(&ctx->poll_list);
de0617e4 924 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 925 INIT_LIST_HEAD(&ctx->timeout_list);
fcb323cc
JA
926 init_waitqueue_head(&ctx->inflight_wait);
927 spin_lock_init(&ctx->inflight_lock);
928 INIT_LIST_HEAD(&ctx->inflight_list);
2b188cc1 929 return ctx;
206aefde 930err:
0ddf92e8
JA
931 if (ctx->fallback_req)
932 kmem_cache_free(req_cachep, ctx->fallback_req);
206aefde 933 kfree(ctx->completions);
78076bb6 934 kfree(ctx->cancel_hash);
206aefde
JA
935 kfree(ctx);
936 return NULL;
2b188cc1
JA
937}
938
9d858b21 939static inline bool __req_need_defer(struct io_kiocb *req)
7adf4eaf 940{
a197f664
JL
941 struct io_ring_ctx *ctx = req->ctx;
942
498ccd9e
JA
943 return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
944 + atomic_read(&ctx->cached_cq_overflow);
7adf4eaf
JA
945}
946
9d858b21 947static inline bool req_need_defer(struct io_kiocb *req)
de0617e4 948{
87987898 949 if (unlikely(req->flags & REQ_F_IO_DRAIN))
9d858b21 950 return __req_need_defer(req);
de0617e4 951
9d858b21 952 return false;
de0617e4
JA
953}
954
7adf4eaf 955static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
de0617e4
JA
956{
957 struct io_kiocb *req;
958
7adf4eaf 959 req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
9d858b21 960 if (req && !req_need_defer(req)) {
de0617e4
JA
961 list_del_init(&req->list);
962 return req;
963 }
964
965 return NULL;
966}
967
5262f567
JA
968static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
969{
7adf4eaf
JA
970 struct io_kiocb *req;
971
972 req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
93bd25bb
JA
973 if (req) {
974 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
975 return NULL;
fb4b3d3f 976 if (!__req_need_defer(req)) {
93bd25bb
JA
977 list_del_init(&req->list);
978 return req;
979 }
7adf4eaf
JA
980 }
981
982 return NULL;
5262f567
JA
983}
984
de0617e4 985static void __io_commit_cqring(struct io_ring_ctx *ctx)
2b188cc1 986{
75b28aff 987 struct io_rings *rings = ctx->rings;
2b188cc1 988
07910158
PB
989 /* order cqe stores with ring update */
990 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
2b188cc1 991
07910158
PB
992 if (wq_has_sleeper(&ctx->cq_wait)) {
993 wake_up_interruptible(&ctx->cq_wait);
994 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
2b188cc1
JA
995 }
996}
997
cccf0ee8
JA
998static inline void io_req_work_grab_env(struct io_kiocb *req,
999 const struct io_op_def *def)
1000{
1001 if (!req->work.mm && def->needs_mm) {
1002 mmgrab(current->mm);
1003 req->work.mm = current->mm;
2b188cc1 1004 }
cccf0ee8
JA
1005 if (!req->work.creds)
1006 req->work.creds = get_current_cred();
ff002b30
JA
1007 if (!req->work.fs && def->needs_fs) {
1008 spin_lock(&current->fs->lock);
1009 if (!current->fs->in_exec) {
1010 req->work.fs = current->fs;
1011 req->work.fs->users++;
1012 } else {
1013 req->work.flags |= IO_WQ_WORK_CANCEL;
1014 }
1015 spin_unlock(&current->fs->lock);
1016 }
6ab23144
JA
1017 if (!req->work.task_pid)
1018 req->work.task_pid = task_pid_vnr(current);
2b188cc1
JA
1019}
1020
cccf0ee8 1021static inline void io_req_work_drop_env(struct io_kiocb *req)
18d9be1a 1022{
cccf0ee8
JA
1023 if (req->work.mm) {
1024 mmdrop(req->work.mm);
1025 req->work.mm = NULL;
1026 }
1027 if (req->work.creds) {
1028 put_cred(req->work.creds);
1029 req->work.creds = NULL;
1030 }
ff002b30
JA
1031 if (req->work.fs) {
1032 struct fs_struct *fs = req->work.fs;
1033
1034 spin_lock(&req->work.fs->lock);
1035 if (--fs->users)
1036 fs = NULL;
1037 spin_unlock(&req->work.fs->lock);
1038 if (fs)
1039 free_fs_struct(fs);
1040 }
561fb04a
JA
1041}
1042
94ae5e77
JA
1043static inline bool io_prep_async_work(struct io_kiocb *req,
1044 struct io_kiocb **link)
18d9be1a 1045{
d3656344 1046 const struct io_op_def *def = &io_op_defs[req->opcode];
561fb04a 1047 bool do_hashed = false;
54a91f3b 1048
d3656344
JA
1049 if (req->flags & REQ_F_ISREG) {
1050 if (def->hash_reg_file)
3529d8c2 1051 do_hashed = true;
d3656344
JA
1052 } else {
1053 if (def->unbound_nonreg_file)
3529d8c2 1054 req->work.flags |= IO_WQ_WORK_UNBOUND;
54a91f3b 1055 }
cccf0ee8
JA
1056
1057 io_req_work_grab_env(req, def);
54a91f3b 1058
94ae5e77 1059 *link = io_prep_linked_timeout(req);
561fb04a
JA
1060 return do_hashed;
1061}
1062
a197f664 1063static inline void io_queue_async_work(struct io_kiocb *req)
561fb04a 1064{
a197f664 1065 struct io_ring_ctx *ctx = req->ctx;
94ae5e77
JA
1066 struct io_kiocb *link;
1067 bool do_hashed;
1068
1069 do_hashed = io_prep_async_work(req, &link);
561fb04a
JA
1070
1071 trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
1072 req->flags);
1073 if (!do_hashed) {
1074 io_wq_enqueue(ctx->io_wq, &req->work);
1075 } else {
1076 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
1077 file_inode(req->file));
1078 }
94ae5e77
JA
1079
1080 if (link)
1081 io_queue_linked_timeout(link);
18d9be1a
JA
1082}
1083
5262f567
JA
1084static void io_kill_timeout(struct io_kiocb *req)
1085{
1086 int ret;
1087
2d28390a 1088 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
5262f567
JA
1089 if (ret != -1) {
1090 atomic_inc(&req->ctx->cq_timeouts);
842f9612 1091 list_del_init(&req->list);
78e19bbe 1092 io_cqring_fill_event(req, 0);
ec9c02ad 1093 io_put_req(req);
5262f567
JA
1094 }
1095}
1096
1097static void io_kill_timeouts(struct io_ring_ctx *ctx)
1098{
1099 struct io_kiocb *req, *tmp;
1100
1101 spin_lock_irq(&ctx->completion_lock);
1102 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1103 io_kill_timeout(req);
1104 spin_unlock_irq(&ctx->completion_lock);
1105}
1106
de0617e4
JA
1107static void io_commit_cqring(struct io_ring_ctx *ctx)
1108{
1109 struct io_kiocb *req;
1110
5262f567
JA
1111 while ((req = io_get_timeout_req(ctx)) != NULL)
1112 io_kill_timeout(req);
1113
de0617e4
JA
1114 __io_commit_cqring(ctx);
1115
87987898 1116 while ((req = io_get_deferred_req(ctx)) != NULL)
a197f664 1117 io_queue_async_work(req);
de0617e4
JA
1118}
1119
2b188cc1
JA
1120static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1121{
75b28aff 1122 struct io_rings *rings = ctx->rings;
2b188cc1
JA
1123 unsigned tail;
1124
1125 tail = ctx->cached_cq_tail;
115e12e5
SB
1126 /*
1127 * writes to the cq entry need to come after reading head; the
1128 * control dependency is enough as we're using WRITE_ONCE to
1129 * fill the cq entry
1130 */
75b28aff 1131 if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
2b188cc1
JA
1132 return NULL;
1133
1134 ctx->cached_cq_tail++;
75b28aff 1135 return &rings->cqes[tail & ctx->cq_mask];
2b188cc1
JA
1136}
1137
f2842ab5
JA
1138static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1139{
f0b493e6
JA
1140 if (!ctx->cq_ev_fd)
1141 return false;
f2842ab5
JA
1142 if (!ctx->eventfd_async)
1143 return true;
b41e9852 1144 return io_wq_current_is_worker();
f2842ab5
JA
1145}
1146
b41e9852 1147static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5
JA
1148{
1149 if (waitqueue_active(&ctx->wait))
1150 wake_up(&ctx->wait);
1151 if (waitqueue_active(&ctx->sqo_wait))
1152 wake_up(&ctx->sqo_wait);
b41e9852 1153 if (io_should_trigger_evfd(ctx))
1d7bb1d5
JA
1154 eventfd_signal(ctx->cq_ev_fd, 1);
1155}
1156
c4a2ed72
JA
1157/* Returns true if there are no backlogged entries after the flush */
1158static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5
JA
1159{
1160 struct io_rings *rings = ctx->rings;
1161 struct io_uring_cqe *cqe;
1162 struct io_kiocb *req;
1163 unsigned long flags;
1164 LIST_HEAD(list);
1165
1166 if (!force) {
1167 if (list_empty_careful(&ctx->cq_overflow_list))
c4a2ed72 1168 return true;
1d7bb1d5
JA
1169 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1170 rings->cq_ring_entries))
c4a2ed72 1171 return false;
1d7bb1d5
JA
1172 }
1173
1174 spin_lock_irqsave(&ctx->completion_lock, flags);
1175
1176 /* if force is set, the ring is going away. always drop after that */
1177 if (force)
69b3e546 1178 ctx->cq_overflow_flushed = 1;
1d7bb1d5 1179
c4a2ed72 1180 cqe = NULL;
1d7bb1d5
JA
1181 while (!list_empty(&ctx->cq_overflow_list)) {
1182 cqe = io_get_cqring(ctx);
1183 if (!cqe && !force)
1184 break;
1185
1186 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1187 list);
1188 list_move(&req->list, &list);
2ca10259 1189 req->flags &= ~REQ_F_OVERFLOW;
1d7bb1d5
JA
1190 if (cqe) {
1191 WRITE_ONCE(cqe->user_data, req->user_data);
1192 WRITE_ONCE(cqe->res, req->result);
bcda7baa 1193 WRITE_ONCE(cqe->flags, req->cflags);
1d7bb1d5
JA
1194 } else {
1195 WRITE_ONCE(ctx->rings->cq_overflow,
1196 atomic_inc_return(&ctx->cached_cq_overflow));
1197 }
1198 }
1199
1200 io_commit_cqring(ctx);
ad3eb2c8
JA
1201 if (cqe) {
1202 clear_bit(0, &ctx->sq_check_overflow);
1203 clear_bit(0, &ctx->cq_check_overflow);
1204 }
1d7bb1d5
JA
1205 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1206 io_cqring_ev_posted(ctx);
1207
1208 while (!list_empty(&list)) {
1209 req = list_first_entry(&list, struct io_kiocb, list);
1210 list_del(&req->list);
ec9c02ad 1211 io_put_req(req);
1d7bb1d5 1212 }
c4a2ed72
JA
1213
1214 return cqe != NULL;
1d7bb1d5
JA
1215}
1216
bcda7baa 1217static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
2b188cc1 1218{
78e19bbe 1219 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1220 struct io_uring_cqe *cqe;
1221
78e19bbe 1222 trace_io_uring_complete(ctx, req->user_data, res);
51c3ff62 1223
2b188cc1
JA
1224 /*
1225 * If we can't get a cq entry, userspace overflowed the
1226 * submission (by quite a lot). Increment the overflow count in
1227 * the ring.
1228 */
1229 cqe = io_get_cqring(ctx);
1d7bb1d5 1230 if (likely(cqe)) {
78e19bbe 1231 WRITE_ONCE(cqe->user_data, req->user_data);
2b188cc1 1232 WRITE_ONCE(cqe->res, res);
bcda7baa 1233 WRITE_ONCE(cqe->flags, cflags);
1d7bb1d5 1234 } else if (ctx->cq_overflow_flushed) {
498ccd9e
JA
1235 WRITE_ONCE(ctx->rings->cq_overflow,
1236 atomic_inc_return(&ctx->cached_cq_overflow));
1d7bb1d5 1237 } else {
ad3eb2c8
JA
1238 if (list_empty(&ctx->cq_overflow_list)) {
1239 set_bit(0, &ctx->sq_check_overflow);
1240 set_bit(0, &ctx->cq_check_overflow);
1241 }
2ca10259 1242 req->flags |= REQ_F_OVERFLOW;
1d7bb1d5
JA
1243 refcount_inc(&req->refs);
1244 req->result = res;
bcda7baa 1245 req->cflags = cflags;
1d7bb1d5 1246 list_add_tail(&req->list, &ctx->cq_overflow_list);
2b188cc1
JA
1247 }
1248}
1249
bcda7baa
JA
1250static void io_cqring_fill_event(struct io_kiocb *req, long res)
1251{
1252 __io_cqring_fill_event(req, res, 0);
1253}
1254
1255static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
2b188cc1 1256{
78e19bbe 1257 struct io_ring_ctx *ctx = req->ctx;
2b188cc1
JA
1258 unsigned long flags;
1259
1260 spin_lock_irqsave(&ctx->completion_lock, flags);
bcda7baa 1261 __io_cqring_fill_event(req, res, cflags);
2b188cc1
JA
1262 io_commit_cqring(ctx);
1263 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1264
8c838788 1265 io_cqring_ev_posted(ctx);
2b188cc1
JA
1266}
1267
bcda7baa
JA
1268static void io_cqring_add_event(struct io_kiocb *req, long res)
1269{
1270 __io_cqring_add_event(req, res, 0);
1271}
1272
0ddf92e8
JA
1273static inline bool io_is_fallback_req(struct io_kiocb *req)
1274{
1275 return req == (struct io_kiocb *)
1276 ((unsigned long) req->ctx->fallback_req & ~1UL);
1277}
1278
1279static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1280{
1281 struct io_kiocb *req;
1282
1283 req = ctx->fallback_req;
1284 if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
1285 return req;
1286
1287 return NULL;
1288}
1289
2579f913
JA
1290static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
1291 struct io_submit_state *state)
2b188cc1 1292{
fd6fab2c 1293 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2b188cc1
JA
1294 struct io_kiocb *req;
1295
2579f913 1296 if (!state) {
fd6fab2c 1297 req = kmem_cache_alloc(req_cachep, gfp);
2579f913 1298 if (unlikely(!req))
0ddf92e8 1299 goto fallback;
2579f913
JA
1300 } else if (!state->free_reqs) {
1301 size_t sz;
1302 int ret;
1303
1304 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
fd6fab2c
JA
1305 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1306
1307 /*
1308 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1309 * retry single alloc to be on the safe side.
1310 */
1311 if (unlikely(ret <= 0)) {
1312 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1313 if (!state->reqs[0])
0ddf92e8 1314 goto fallback;
fd6fab2c
JA
1315 ret = 1;
1316 }
2579f913 1317 state->free_reqs = ret - 1;
6c8a3134 1318 req = state->reqs[ret - 1];
2579f913 1319 } else {
2579f913 1320 state->free_reqs--;
6c8a3134 1321 req = state->reqs[state->free_reqs];
2b188cc1
JA
1322 }
1323
0ddf92e8 1324got_it:
1a6b74fc 1325 req->io = NULL;
60c112b0 1326 req->file = NULL;
2579f913
JA
1327 req->ctx = ctx;
1328 req->flags = 0;
e65ef56d
JA
1329 /* one is dropped after submission, the other at completion */
1330 refcount_set(&req->refs, 2);
9e645e11 1331 req->result = 0;
561fb04a 1332 INIT_IO_WORK(&req->work, io_wq_submit_work);
2579f913 1333 return req;
0ddf92e8
JA
1334fallback:
1335 req = io_get_fallback_req(ctx);
1336 if (req)
1337 goto got_it;
6805b32e 1338 percpu_ref_put(&ctx->refs);
2b188cc1
JA
1339 return NULL;
1340}
1341
8da11c19
PB
1342static inline void io_put_file(struct io_kiocb *req, struct file *file,
1343 bool fixed)
1344{
1345 if (fixed)
1346 percpu_ref_put(&req->ctx->file_data->refs);
1347 else
1348 fput(file);
1349}
1350
2b85edfc 1351static void __io_req_do_free(struct io_kiocb *req)
def596e9 1352{
2b85edfc
PB
1353 if (likely(!io_is_fallback_req(req)))
1354 kmem_cache_free(req_cachep, req);
1355 else
1356 clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
1357}
1358
c6ca97b3 1359static void __io_req_aux_free(struct io_kiocb *req)
2b188cc1 1360{
929a3af9
PB
1361 if (req->flags & REQ_F_NEED_CLEANUP)
1362 io_cleanup_req(req);
1363
96fd84d8 1364 kfree(req->io);
8da11c19
PB
1365 if (req->file)
1366 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
cccf0ee8
JA
1367
1368 io_req_work_drop_env(req);
def596e9
JA
1369}
1370
9e645e11 1371static void __io_free_req(struct io_kiocb *req)
2b188cc1 1372{
c6ca97b3 1373 __io_req_aux_free(req);
fcb323cc 1374
fcb323cc 1375 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3 1376 struct io_ring_ctx *ctx = req->ctx;
fcb323cc
JA
1377 unsigned long flags;
1378
1379 spin_lock_irqsave(&ctx->inflight_lock, flags);
1380 list_del(&req->inflight_entry);
1381 if (waitqueue_active(&ctx->inflight_wait))
1382 wake_up(&ctx->inflight_wait);
1383 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1384 }
2b85edfc
PB
1385
1386 percpu_ref_put(&req->ctx->refs);
1387 __io_req_do_free(req);
e65ef56d
JA
1388}
1389
c6ca97b3
JA
1390struct req_batch {
1391 void *reqs[IO_IOPOLL_BATCH];
1392 int to_free;
1393 int need_iter;
1394};
1395
1396static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1397{
10fef4be
JA
1398 int fixed_refs = rb->to_free;
1399
c6ca97b3
JA
1400 if (!rb->to_free)
1401 return;
1402 if (rb->need_iter) {
1403 int i, inflight = 0;
1404 unsigned long flags;
1405
10fef4be 1406 fixed_refs = 0;
c6ca97b3
JA
1407 for (i = 0; i < rb->to_free; i++) {
1408 struct io_kiocb *req = rb->reqs[i];
1409
10fef4be 1410 if (req->flags & REQ_F_FIXED_FILE) {
c6ca97b3 1411 req->file = NULL;
10fef4be
JA
1412 fixed_refs++;
1413 }
c6ca97b3
JA
1414 if (req->flags & REQ_F_INFLIGHT)
1415 inflight++;
c6ca97b3
JA
1416 __io_req_aux_free(req);
1417 }
1418 if (!inflight)
1419 goto do_free;
1420
1421 spin_lock_irqsave(&ctx->inflight_lock, flags);
1422 for (i = 0; i < rb->to_free; i++) {
1423 struct io_kiocb *req = rb->reqs[i];
1424
10fef4be 1425 if (req->flags & REQ_F_INFLIGHT) {
c6ca97b3
JA
1426 list_del(&req->inflight_entry);
1427 if (!--inflight)
1428 break;
1429 }
1430 }
1431 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1432
1433 if (waitqueue_active(&ctx->inflight_wait))
1434 wake_up(&ctx->inflight_wait);
1435 }
1436do_free:
1437 kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
10fef4be
JA
1438 if (fixed_refs)
1439 percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
c6ca97b3 1440 percpu_ref_put_many(&ctx->refs, rb->to_free);
c6ca97b3 1441 rb->to_free = rb->need_iter = 0;
e65ef56d
JA
1442}
1443
a197f664 1444static bool io_link_cancel_timeout(struct io_kiocb *req)
2665abfd 1445{
a197f664 1446 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1447 int ret;
1448
2d28390a 1449 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2665abfd 1450 if (ret != -1) {
78e19bbe 1451 io_cqring_fill_event(req, -ECANCELED);
2665abfd
JA
1452 io_commit_cqring(ctx);
1453 req->flags &= ~REQ_F_LINK;
ec9c02ad 1454 io_put_req(req);
2665abfd
JA
1455 return true;
1456 }
1457
1458 return false;
e65ef56d
JA
1459}
1460
ba816ad6 1461static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
9e645e11 1462{
2665abfd 1463 struct io_ring_ctx *ctx = req->ctx;
2665abfd 1464 bool wake_ev = false;
9e645e11 1465
4d7dd462
JA
1466 /* Already got next link */
1467 if (req->flags & REQ_F_LINK_NEXT)
1468 return;
1469
9e645e11
JA
1470 /*
1471 * The list should never be empty when we are called here. But could
1472 * potentially happen if the chain is messed up, check to be on the
1473 * safe side.
1474 */
4493233e
PB
1475 while (!list_empty(&req->link_list)) {
1476 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1477 struct io_kiocb, link_list);
94ae5e77 1478
4493233e
PB
1479 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1480 (nxt->flags & REQ_F_TIMEOUT))) {
1481 list_del_init(&nxt->link_list);
94ae5e77 1482 wake_ev |= io_link_cancel_timeout(nxt);
94ae5e77
JA
1483 req->flags &= ~REQ_F_LINK_TIMEOUT;
1484 continue;
1485 }
9e645e11 1486
4493233e
PB
1487 list_del_init(&req->link_list);
1488 if (!list_empty(&nxt->link_list))
1489 nxt->flags |= REQ_F_LINK;
b18fdf71 1490 *nxtptr = nxt;
94ae5e77 1491 break;
9e645e11 1492 }
2665abfd 1493
4d7dd462 1494 req->flags |= REQ_F_LINK_NEXT;
2665abfd
JA
1495 if (wake_ev)
1496 io_cqring_ev_posted(ctx);
9e645e11
JA
1497}
1498
1499/*
1500 * Called if REQ_F_LINK is set, and we fail the head request
1501 */
1502static void io_fail_links(struct io_kiocb *req)
1503{
2665abfd 1504 struct io_ring_ctx *ctx = req->ctx;
2665abfd
JA
1505 unsigned long flags;
1506
1507 spin_lock_irqsave(&ctx->completion_lock, flags);
9e645e11
JA
1508
1509 while (!list_empty(&req->link_list)) {
4493233e
PB
1510 struct io_kiocb *link = list_first_entry(&req->link_list,
1511 struct io_kiocb, link_list);
9e645e11 1512
4493233e 1513 list_del_init(&link->link_list);
c826bd7a 1514 trace_io_uring_fail_link(req, link);
2665abfd
JA
1515
1516 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
d625c6ee 1517 link->opcode == IORING_OP_LINK_TIMEOUT) {
a197f664 1518 io_link_cancel_timeout(link);
2665abfd 1519 } else {
78e19bbe 1520 io_cqring_fill_event(link, -ECANCELED);
978db57e 1521 __io_double_put_req(link);
2665abfd 1522 }
5d960724 1523 req->flags &= ~REQ_F_LINK_TIMEOUT;
9e645e11 1524 }
2665abfd
JA
1525
1526 io_commit_cqring(ctx);
1527 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1528 io_cqring_ev_posted(ctx);
9e645e11
JA
1529}
1530
4d7dd462 1531static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
9e645e11 1532{
4d7dd462 1533 if (likely(!(req->flags & REQ_F_LINK)))
2665abfd 1534 return;
2665abfd 1535
9e645e11
JA
1536 /*
1537 * If LINK is set, we have dependent requests in this chain. If we
1538 * didn't fail this request, queue the first one up, moving any other
1539 * dependencies to the next request. In case of failure, fail the rest
1540 * of the chain.
1541 */
2665abfd
JA
1542 if (req->flags & REQ_F_FAIL_LINK) {
1543 io_fail_links(req);
7c9e7f0f
JA
1544 } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1545 REQ_F_LINK_TIMEOUT) {
2665abfd
JA
1546 struct io_ring_ctx *ctx = req->ctx;
1547 unsigned long flags;
1548
1549 /*
1550 * If this is a timeout link, we could be racing with the
1551 * timeout timer. Grab the completion lock for this case to
7c9e7f0f 1552 * protect against that.
2665abfd
JA
1553 */
1554 spin_lock_irqsave(&ctx->completion_lock, flags);
1555 io_req_link_next(req, nxt);
1556 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1557 } else {
1558 io_req_link_next(req, nxt);
9e645e11 1559 }
4d7dd462 1560}
9e645e11 1561
c69f8dbe
JL
1562static void io_free_req(struct io_kiocb *req)
1563{
944e58bf
PB
1564 struct io_kiocb *nxt = NULL;
1565
1566 io_req_find_next(req, &nxt);
70cf9f32 1567 __io_free_req(req);
944e58bf
PB
1568
1569 if (nxt)
1570 io_queue_async_work(nxt);
c69f8dbe
JL
1571}
1572
7a743e22
PB
1573static void io_link_work_cb(struct io_wq_work **workptr)
1574{
1575 struct io_wq_work *work = *workptr;
1576 struct io_kiocb *link = work->data;
1577
1578 io_queue_linked_timeout(link);
1579 io_wq_submit_work(workptr);
1580}
1581
1582static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1583{
1584 struct io_kiocb *link;
1585
1586 *workptr = &nxt->work;
1587 link = io_prep_linked_timeout(nxt);
1588 if (link) {
1589 nxt->work.func = io_link_work_cb;
1590 nxt->work.data = link;
1591 }
1592}
1593
ba816ad6
JA
1594/*
1595 * Drop reference to request, return next in chain (if there is one) if this
1596 * was the last reference to this request.
1597 */
f9bd67f6 1598__attribute__((nonnull))
ec9c02ad 1599static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
e65ef56d 1600{
2a44f467
JA
1601 if (refcount_dec_and_test(&req->refs)) {
1602 io_req_find_next(req, nxtptr);
4d7dd462 1603 __io_free_req(req);
2a44f467 1604 }
2b188cc1
JA
1605}
1606
e65ef56d
JA
1607static void io_put_req(struct io_kiocb *req)
1608{
1609 if (refcount_dec_and_test(&req->refs))
1610 io_free_req(req);
2b188cc1
JA
1611}
1612
e9fd9396
PB
1613static void io_steal_work(struct io_kiocb *req,
1614 struct io_wq_work **workptr)
7a743e22
PB
1615{
1616 /*
1617 * It's in an io-wq worker, so there always should be at least
1618 * one reference, which will be dropped in io_put_work() just
1619 * after the current handler returns.
1620 *
1621 * It also means, that if the counter dropped to 1, then there is
1622 * no asynchronous users left, so it's safe to steal the next work.
1623 */
7a743e22
PB
1624 if (refcount_read(&req->refs) == 1) {
1625 struct io_kiocb *nxt = NULL;
1626
1627 io_req_find_next(req, &nxt);
1628 if (nxt)
1629 io_wq_assign_next(workptr, nxt);
1630 }
1631}
1632
978db57e
JA
1633/*
1634 * Must only be used if we don't need to care about links, usually from
1635 * within the completion handling itself.
1636 */
1637static void __io_double_put_req(struct io_kiocb *req)
78e19bbe
JA
1638{
1639 /* drop both submit and complete references */
1640 if (refcount_sub_and_test(2, &req->refs))
1641 __io_free_req(req);
1642}
1643
978db57e
JA
1644static void io_double_put_req(struct io_kiocb *req)
1645{
1646 /* drop both submit and complete references */
1647 if (refcount_sub_and_test(2, &req->refs))
1648 io_free_req(req);
1649}
1650
1d7bb1d5 1651static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
a3a0e43f 1652{
84f97dc2
JA
1653 struct io_rings *rings = ctx->rings;
1654
ad3eb2c8
JA
1655 if (test_bit(0, &ctx->cq_check_overflow)) {
1656 /*
1657 * noflush == true is from the waitqueue handler, just ensure
1658 * we wake up the task, and the next invocation will flush the
1659 * entries. We cannot safely to it from here.
1660 */
1661 if (noflush && !list_empty(&ctx->cq_overflow_list))
1662 return -1U;
1d7bb1d5 1663
ad3eb2c8
JA
1664 io_cqring_overflow_flush(ctx, false);
1665 }
1d7bb1d5 1666
a3a0e43f
JA
1667 /* See comment at the top of this file */
1668 smp_rmb();
ad3eb2c8 1669 return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
a3a0e43f
JA
1670}
1671
fb5ccc98
PB
1672static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1673{
1674 struct io_rings *rings = ctx->rings;
1675
1676 /* make sure SQ entry isn't read before tail */
1677 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1678}
1679
8237e045 1680static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
e94f141b 1681{
c6ca97b3
JA
1682 if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
1683 return false;
e94f141b 1684
c6ca97b3
JA
1685 if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
1686 rb->need_iter++;
1687
1688 rb->reqs[rb->to_free++] = req;
1689 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1690 io_free_req_many(req->ctx, rb);
1691 return true;
e94f141b
JA
1692}
1693
bcda7baa
JA
1694static int io_put_kbuf(struct io_kiocb *req)
1695{
4d954c25 1696 struct io_buffer *kbuf;
bcda7baa
JA
1697 int cflags;
1698
4d954c25 1699 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
bcda7baa
JA
1700 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1701 cflags |= IORING_CQE_F_BUFFER;
1702 req->rw.addr = 0;
1703 kfree(kbuf);
1704 return cflags;
1705}
1706
def596e9
JA
1707/*
1708 * Find and free completed poll iocbs
1709 */
1710static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1711 struct list_head *done)
1712{
8237e045 1713 struct req_batch rb;
def596e9 1714 struct io_kiocb *req;
def596e9 1715
c6ca97b3 1716 rb.to_free = rb.need_iter = 0;
def596e9 1717 while (!list_empty(done)) {
bcda7baa
JA
1718 int cflags = 0;
1719
def596e9
JA
1720 req = list_first_entry(done, struct io_kiocb, list);
1721 list_del(&req->list);
1722
bcda7baa
JA
1723 if (req->flags & REQ_F_BUFFER_SELECTED)
1724 cflags = io_put_kbuf(req);
1725
1726 __io_cqring_fill_event(req, req->result, cflags);
def596e9
JA
1727 (*nr_events)++;
1728
8237e045
JA
1729 if (refcount_dec_and_test(&req->refs) &&
1730 !io_req_multi_free(&rb, req))
1731 io_free_req(req);
def596e9 1732 }
def596e9 1733
09bb8394 1734 io_commit_cqring(ctx);
8237e045 1735 io_free_req_many(ctx, &rb);
def596e9
JA
1736}
1737
1738static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1739 long min)
1740{
1741 struct io_kiocb *req, *tmp;
1742 LIST_HEAD(done);
1743 bool spin;
1744 int ret;
1745
1746 /*
1747 * Only spin for completions if we don't have multiple devices hanging
1748 * off our complete list, and we're under the requested amount.
1749 */
1750 spin = !ctx->poll_multi_file && *nr_events < min;
1751
1752 ret = 0;
1753 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
9adbd45d 1754 struct kiocb *kiocb = &req->rw.kiocb;
def596e9
JA
1755
1756 /*
1757 * Move completed entries to our local list. If we find a
1758 * request that requires polling, break out and complete
1759 * the done list first, if we have entries there.
1760 */
1761 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1762 list_move_tail(&req->list, &done);
1763 continue;
1764 }
1765 if (!list_empty(&done))
1766 break;
1767
1768 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1769 if (ret < 0)
1770 break;
1771
1772 if (ret && spin)
1773 spin = false;
1774 ret = 0;
1775 }
1776
1777 if (!list_empty(&done))
1778 io_iopoll_complete(ctx, nr_events, &done);
1779
1780 return ret;
1781}
1782
1783/*
d195a66e 1784 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
def596e9
JA
1785 * non-spinning poll check - we'll still enter the driver poll loop, but only
1786 * as a non-spinning completion check.
1787 */
1788static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1789 long min)
1790{
08f5439f 1791 while (!list_empty(&ctx->poll_list) && !need_resched()) {
def596e9
JA
1792 int ret;
1793
1794 ret = io_do_iopoll(ctx, nr_events, min);
1795 if (ret < 0)
1796 return ret;
1797 if (!min || *nr_events >= min)
1798 return 0;
1799 }
1800
1801 return 1;
1802}
1803
1804/*
1805 * We can't just wait for polled events to come to us, we have to actively
1806 * find and complete them.
1807 */
1808static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1809{
1810 if (!(ctx->flags & IORING_SETUP_IOPOLL))
1811 return;
1812
1813 mutex_lock(&ctx->uring_lock);
1814 while (!list_empty(&ctx->poll_list)) {
1815 unsigned int nr_events = 0;
1816
1817 io_iopoll_getevents(ctx, &nr_events, 1);
08f5439f
JA
1818
1819 /*
1820 * Ensure we allow local-to-the-cpu processing to take place,
1821 * in this case we need to ensure that we reap all events.
1822 */
1823 cond_resched();
def596e9
JA
1824 }
1825 mutex_unlock(&ctx->uring_lock);
1826}
1827
c7849be9
XW
1828static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1829 long min)
def596e9 1830{
2b2ed975 1831 int iters = 0, ret = 0;
500f9fba 1832
c7849be9
XW
1833 /*
1834 * We disallow the app entering submit/complete with polling, but we
1835 * still need to lock the ring to prevent racing with polled issue
1836 * that got punted to a workqueue.
1837 */
1838 mutex_lock(&ctx->uring_lock);
def596e9
JA
1839 do {
1840 int tmin = 0;
1841
a3a0e43f
JA
1842 /*
1843 * Don't enter poll loop if we already have events pending.
1844 * If we do, we can potentially be spinning for commands that
1845 * already triggered a CQE (eg in error).
1846 */
1d7bb1d5 1847 if (io_cqring_events(ctx, false))
a3a0e43f
JA
1848 break;
1849
500f9fba
JA
1850 /*
1851 * If a submit got punted to a workqueue, we can have the
1852 * application entering polling for a command before it gets
1853 * issued. That app will hold the uring_lock for the duration
1854 * of the poll right here, so we need to take a breather every
1855 * now and then to ensure that the issue has a chance to add
1856 * the poll to the issued list. Otherwise we can spin here
1857 * forever, while the workqueue is stuck trying to acquire the
1858 * very same mutex.
1859 */
1860 if (!(++iters & 7)) {
1861 mutex_unlock(&ctx->uring_lock);
1862 mutex_lock(&ctx->uring_lock);
1863 }
1864
def596e9
JA
1865 if (*nr_events < min)
1866 tmin = min - *nr_events;
1867
1868 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1869 if (ret <= 0)
1870 break;
1871 ret = 0;
1872 } while (min && !*nr_events && !need_resched());
1873
500f9fba 1874 mutex_unlock(&ctx->uring_lock);
def596e9
JA
1875 return ret;
1876}
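
/*
 * Illustrative userspace sketch, not part of this file: with
 * IORING_SETUP_IOPOLL the application has to re-enter the kernel to reap
 * completions, which is what drives io_iopoll_check() above. Assumes the
 * liburing helpers and an O_DIRECT file descriptor 'fd'.
 *
 *	struct io_uring ring;
 *	struct io_uring_cqe *cqe;
 *
 *	io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
 *	io_uring_prep_read(io_uring_get_sqe(&ring), fd, buf, len, 0);
 *	io_uring_submit(&ring);
 *	io_uring_wait_cqe(&ring, &cqe);		(ends up polling, not sleeping)
 */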
1877
491381ce 1878static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 1879{
491381ce
JA
1880 /*
1881 * Tell lockdep we inherited freeze protection from submission
1882 * thread.
1883 */
1884 if (req->flags & REQ_F_ISREG) {
1885 struct inode *inode = file_inode(req->file);
2b188cc1 1886
491381ce 1887 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2b188cc1 1888 }
491381ce 1889 file_end_write(req->file);
2b188cc1
JA
1890}
1891
4e88d6e7
JA
1892static inline void req_set_fail_links(struct io_kiocb *req)
1893{
1894 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1895 req->flags |= REQ_F_FAIL_LINK;
1896}
1897
ba816ad6 1898static void io_complete_rw_common(struct kiocb *kiocb, long res)
2b188cc1 1899{
9adbd45d 1900 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
bcda7baa 1901 int cflags = 0;
2b188cc1 1902
491381ce
JA
1903 if (kiocb->ki_flags & IOCB_WRITE)
1904 kiocb_end_write(req);
2b188cc1 1905
4e88d6e7
JA
1906 if (res != req->result)
1907 req_set_fail_links(req);
bcda7baa
JA
1908 if (req->flags & REQ_F_BUFFER_SELECTED)
1909 cflags = io_put_kbuf(req);
1910 __io_cqring_add_event(req, res, cflags);
ba816ad6
JA
1911}
1912
1913static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1914{
9adbd45d 1915 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6
JA
1916
1917 io_complete_rw_common(kiocb, res);
e65ef56d 1918 io_put_req(req);
2b188cc1
JA
1919}
1920
def596e9
JA
1921static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1922{
9adbd45d 1923 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 1924
491381ce
JA
1925 if (kiocb->ki_flags & IOCB_WRITE)
1926 kiocb_end_write(req);
def596e9 1927
4e88d6e7
JA
1928 if (res != req->result)
1929 req_set_fail_links(req);
9e645e11 1930 req->result = res;
def596e9
JA
1931 if (res != -EAGAIN)
1932 req->flags |= REQ_F_IOPOLL_COMPLETED;
1933}
1934
1935/*
1936 * After the iocb has been issued, it's safe to be found on the poll list.
1937 * Adding the kiocb to the list AFTER submission ensures that we don't
 1938 * find it from an io_iopoll_getevents() thread before the issuer is done
1939 * accessing the kiocb cookie.
1940 */
1941static void io_iopoll_req_issued(struct io_kiocb *req)
1942{
1943 struct io_ring_ctx *ctx = req->ctx;
1944
1945 /*
1946 * Track whether we have multiple files in our lists. This will impact
1947 * how we do polling eventually, not spinning if we're on potentially
1948 * different devices.
1949 */
1950 if (list_empty(&ctx->poll_list)) {
1951 ctx->poll_multi_file = false;
1952 } else if (!ctx->poll_multi_file) {
1953 struct io_kiocb *list_req;
1954
1955 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1956 list);
9adbd45d 1957 if (list_req->file != req->file)
def596e9
JA
1958 ctx->poll_multi_file = true;
1959 }
1960
1961 /*
1962 * For fast devices, IO may have already completed. If it has, add
1963 * it to the front so we find it first.
1964 */
1965 if (req->flags & REQ_F_IOPOLL_COMPLETED)
1966 list_add(&req->list, &ctx->poll_list);
1967 else
1968 list_add_tail(&req->list, &ctx->poll_list);
bdcd3eab
XW
1969
1970 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1971 wq_has_sleeper(&ctx->sqo_wait))
1972 wake_up(&ctx->sqo_wait);
def596e9
JA
1973}
1974
3d6770fb 1975static void io_file_put(struct io_submit_state *state)
9a56a232 1976{
3d6770fb 1977 if (state->file) {
9a56a232
JA
1978 int diff = state->has_refs - state->used_refs;
1979
1980 if (diff)
1981 fput_many(state->file, diff);
1982 state->file = NULL;
1983 }
1984}
1985
1986/*
1987 * Get as many references to a file as we have IOs left in this submission,
1988 * assuming most submissions are for one file, or at least that each file
1989 * has more than one submission.
1990 */
8da11c19 1991static struct file *__io_file_get(struct io_submit_state *state, int fd)
9a56a232
JA
1992{
1993 if (!state)
1994 return fget(fd);
1995
1996 if (state->file) {
1997 if (state->fd == fd) {
1998 state->used_refs++;
1999 state->ios_left--;
2000 return state->file;
2001 }
3d6770fb 2002 io_file_put(state);
9a56a232
JA
2003 }
2004 state->file = fget_many(fd, state->ios_left);
2005 if (!state->file)
2006 return NULL;
2007
2008 state->fd = fd;
2009 state->has_refs = state->ios_left;
2010 state->used_refs = 1;
2011 state->ios_left--;
2012 return state->file;
2013}
2014
2b188cc1
JA
2015/*
2016 * If we tracked the file through the SCM inflight mechanism, we could support
2017 * any file. For now, just ensure that anything potentially problematic is done
2018 * inline.
2019 */
2020static bool io_file_supports_async(struct file *file)
2021{
2022 umode_t mode = file_inode(file)->i_mode;
2023
10d59345 2024 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2b188cc1
JA
2025 return true;
2026 if (S_ISREG(mode) && file->f_op != &io_uring_fops)
2027 return true;
2028
2029 return false;
2030}
2031
3529d8c2
JA
2032static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2033 bool force_nonblock)
2b188cc1 2034{
def596e9 2035 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 2036 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
2037 unsigned ioprio;
2038 int ret;
2b188cc1 2039
491381ce
JA
2040 if (S_ISREG(file_inode(req->file)->i_mode))
2041 req->flags |= REQ_F_ISREG;
2042
2b188cc1 2043 kiocb->ki_pos = READ_ONCE(sqe->off);
ba04291e
JA
2044 if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2045 req->flags |= REQ_F_CUR_POS;
2046 kiocb->ki_pos = req->file->f_pos;
2047 }
2b188cc1 2048 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
2049 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2050 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2051 if (unlikely(ret))
2052 return ret;
2b188cc1
JA
2053
2054 ioprio = READ_ONCE(sqe->ioprio);
2055 if (ioprio) {
2056 ret = ioprio_check_cap(ioprio);
2057 if (ret)
09bb8394 2058 return ret;
2b188cc1
JA
2059
2060 kiocb->ki_ioprio = ioprio;
2061 } else
2062 kiocb->ki_ioprio = get_current_ioprio();
2063
8449eeda 2064 /* don't allow async punt if RWF_NOWAIT was requested */
491381ce
JA
2065 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2066 (req->file->f_flags & O_NONBLOCK))
8449eeda
SB
2067 req->flags |= REQ_F_NOWAIT;
2068
2069 if (force_nonblock)
2b188cc1 2070 kiocb->ki_flags |= IOCB_NOWAIT;
8449eeda 2071
def596e9 2072 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
2073 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2074 !kiocb->ki_filp->f_op->iopoll)
09bb8394 2075 return -EOPNOTSUPP;
2b188cc1 2076
def596e9
JA
2077 kiocb->ki_flags |= IOCB_HIPRI;
2078 kiocb->ki_complete = io_complete_rw_iopoll;
6873e0bd 2079 req->result = 0;
def596e9 2080 } else {
09bb8394
JA
2081 if (kiocb->ki_flags & IOCB_HIPRI)
2082 return -EINVAL;
def596e9
JA
2083 kiocb->ki_complete = io_complete_rw;
2084 }
9adbd45d 2085
3529d8c2
JA
2086 req->rw.addr = READ_ONCE(sqe->addr);
2087 req->rw.len = READ_ONCE(sqe->len);
bcda7baa 2088 /* we own ->private, reuse it for the buffer index / buffer ID */
9adbd45d 2089 req->rw.kiocb.private = (void *) (unsigned long)
3529d8c2 2090 READ_ONCE(sqe->buf_index);
2b188cc1 2091 return 0;
2b188cc1
JA
2092}
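
/*
 * Illustrative userspace sketch, not part of this file (liburing helpers
 * assumed, BUF_LEN made up): a fixed-buffer read. Registration is what
 * populates ctx->user_bufs, and the buf_index stashed in kiocb->private
 * above is what io_import_fixed() uses to find the registered buffer.
 *
 *	struct iovec reg = { .iov_base = buf, .iov_len = BUF_LEN };
 *
 *	io_uring_register_buffers(&ring, &reg, 1);
 *	io_uring_prep_read_fixed(io_uring_get_sqe(&ring), fd, buf, BUF_LEN,
 *				 0, 0);
 *	io_uring_submit(&ring);
 */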
2093
2094static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2095{
2096 switch (ret) {
2097 case -EIOCBQUEUED:
2098 break;
2099 case -ERESTARTSYS:
2100 case -ERESTARTNOINTR:
2101 case -ERESTARTNOHAND:
2102 case -ERESTART_RESTARTBLOCK:
2103 /*
2104 * We can't just restart the syscall, since previously
2105 * submitted sqes may already be in progress. Just fail this
2106 * IO with EINTR.
2107 */
2108 ret = -EINTR;
2109 /* fall through */
2110 default:
2111 kiocb->ki_complete(kiocb, ret, 0);
2112 }
2113}
2114
014db007 2115static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
ba816ad6 2116{
ba04291e
JA
2117 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2118
2119 if (req->flags & REQ_F_CUR_POS)
2120 req->file->f_pos = kiocb->ki_pos;
bcaec089 2121 if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
014db007 2122 io_complete_rw(kiocb, ret, 0);
ba816ad6
JA
2123 else
2124 io_rw_done(kiocb, ret);
2125}
2126
9adbd45d 2127static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
7d009165 2128 struct iov_iter *iter)
edafccee 2129{
9adbd45d
JA
2130 struct io_ring_ctx *ctx = req->ctx;
2131 size_t len = req->rw.len;
edafccee
JA
2132 struct io_mapped_ubuf *imu;
2133 unsigned index, buf_index;
2134 size_t offset;
2135 u64 buf_addr;
2136
2137 /* attempt to use fixed buffers without having provided iovecs */
2138 if (unlikely(!ctx->user_bufs))
2139 return -EFAULT;
2140
9adbd45d 2141 buf_index = (unsigned long) req->rw.kiocb.private;
edafccee
JA
2142 if (unlikely(buf_index >= ctx->nr_user_bufs))
2143 return -EFAULT;
2144
2145 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2146 imu = &ctx->user_bufs[index];
9adbd45d 2147 buf_addr = req->rw.addr;
edafccee
JA
2148
2149 /* overflow */
2150 if (buf_addr + len < buf_addr)
2151 return -EFAULT;
2152 /* not inside the mapped region */
2153 if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2154 return -EFAULT;
2155
2156 /*
2157 * May not be a start of buffer, set size appropriately
2158 * and advance us to the beginning.
2159 */
2160 offset = buf_addr - imu->ubuf;
2161 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2162
2163 if (offset) {
2164 /*
2165 * Don't use iov_iter_advance() here, as it's really slow for
2166 * using the latter parts of a big fixed buffer - it iterates
2167 * over each segment manually. We can cheat a bit here, because
2168 * we know that:
2169 *
2170 * 1) it's a BVEC iter, we set it up
2171 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2172 * first and last bvec
2173 *
2174 * So just find our index, and adjust the iterator afterwards.
2175 * If the offset is within the first bvec (or the whole first
2176 * bvec, just use iov_iter_advance(). This makes it easier
2177 * since we can just skip the first segment, which may not
2178 * be PAGE_SIZE aligned.
2179 */
2180 const struct bio_vec *bvec = imu->bvec;
2181
2182 if (offset <= bvec->bv_len) {
2183 iov_iter_advance(iter, offset);
2184 } else {
2185 unsigned long seg_skip;
2186
2187 /* skip first vec */
2188 offset -= bvec->bv_len;
2189 seg_skip = 1 + (offset >> PAGE_SHIFT);
2190
2191 iter->bvec = bvec + seg_skip;
2192 iter->nr_segs -= seg_skip;
99c79f66 2193 iter->count -= bvec->bv_len + offset;
bd11b3a3 2194 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2195 }
2196 }
2197
5e559561 2198 return len;
edafccee
JA
2199}
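
/*
 * Worked example for the segment-skip math above (an illustration with
 * assumed numbers, not kernel code): take PAGE_SIZE == 4096 and a
 * registered buffer whose first bvec holds 1024 bytes. For offset == 9240:
 *
 *	offset -= 1024;			now 8216
 *	seg_skip = 1 + (8216 >> 12);	== 3 bvecs skipped
 *	iter->iov_offset = 8216 & 4095;	== 24 bytes into the new first bvec
 *	iter->count -= 1024 + 8216;	trims exactly the original 9240
 */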
2200
bcda7baa
JA
2201static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2202{
2203 if (needs_lock)
2204 mutex_unlock(&ctx->uring_lock);
2205}
2206
2207static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2208{
2209 /*
2210 * "Normal" inline submissions always hold the uring_lock, since we
2211 * grab it from the system call. Same is true for the SQPOLL offload.
2212 * The only exception is when we've detached the request and issue it
2213 * from an async worker thread, grab the lock for that case.
2214 */
2215 if (needs_lock)
2216 mutex_lock(&ctx->uring_lock);
2217}
2218
2219static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2220 int bgid, struct io_buffer *kbuf,
2221 bool needs_lock)
2222{
2223 struct io_buffer *head;
2224
2225 if (req->flags & REQ_F_BUFFER_SELECTED)
2226 return kbuf;
2227
2228 io_ring_submit_lock(req->ctx, needs_lock);
2229
2230 lockdep_assert_held(&req->ctx->uring_lock);
2231
2232 head = idr_find(&req->ctx->io_buffer_idr, bgid);
2233 if (head) {
2234 if (!list_empty(&head->list)) {
2235 kbuf = list_last_entry(&head->list, struct io_buffer,
2236 list);
2237 list_del(&kbuf->list);
2238 } else {
2239 kbuf = head;
2240 idr_remove(&req->ctx->io_buffer_idr, bgid);
2241 }
2242 if (*len > kbuf->len)
2243 *len = kbuf->len;
2244 } else {
2245 kbuf = ERR_PTR(-ENOBUFS);
2246 }
2247
2248 io_ring_submit_unlock(req->ctx, needs_lock);
2249
2250 return kbuf;
2251}
2252
4d954c25
JA
2253static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2254 bool needs_lock)
2255{
2256 struct io_buffer *kbuf;
2257 int bgid;
2258
2259 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2260 bgid = (int) (unsigned long) req->rw.kiocb.private;
2261 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2262 if (IS_ERR(kbuf))
2263 return kbuf;
2264 req->rw.addr = (u64) (unsigned long) kbuf;
2265 req->flags |= REQ_F_BUFFER_SELECTED;
2266 return u64_to_user_ptr(kbuf->addr);
2267}
2268
2269#ifdef CONFIG_COMPAT
2270static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2271 bool needs_lock)
2272{
2273 struct compat_iovec __user *uiov;
2274 compat_ssize_t clen;
2275 void __user *buf;
2276 ssize_t len;
2277
2278 uiov = u64_to_user_ptr(req->rw.addr);
2279 if (!access_ok(uiov, sizeof(*uiov)))
2280 return -EFAULT;
2281 if (__get_user(clen, &uiov->iov_len))
2282 return -EFAULT;
2283 if (clen < 0)
2284 return -EINVAL;
2285
2286 len = clen;
2287 buf = io_rw_buffer_select(req, &len, needs_lock);
2288 if (IS_ERR(buf))
2289 return PTR_ERR(buf);
2290 iov[0].iov_base = buf;
2291 iov[0].iov_len = (compat_size_t) len;
2292 return 0;
2293}
2294#endif
2295
2296static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2297 bool needs_lock)
2298{
2299 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2300 void __user *buf;
2301 ssize_t len;
2302
2303 if (copy_from_user(iov, uiov, sizeof(*uiov)))
2304 return -EFAULT;
2305
2306 len = iov[0].iov_len;
2307 if (len < 0)
2308 return -EINVAL;
2309 buf = io_rw_buffer_select(req, &len, needs_lock);
2310 if (IS_ERR(buf))
2311 return PTR_ERR(buf);
2312 iov[0].iov_base = buf;
2313 iov[0].iov_len = len;
2314 return 0;
2315}
2316
2317static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2318 bool needs_lock)
2319{
2320 if (req->flags & REQ_F_BUFFER_SELECTED)
2321 return 0;
2322 if (!req->rw.len)
2323 return 0;
2324 else if (req->rw.len > 1)
2325 return -EINVAL;
2326
2327#ifdef CONFIG_COMPAT
2328 if (req->ctx->compat)
2329 return io_compat_import(req, iov, needs_lock);
2330#endif
2331
2332 return __io_iov_buffer_select(req, iov, needs_lock);
2333}
2334
cf6fd4bd 2335static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
bcda7baa
JA
2336 struct iovec **iovec, struct iov_iter *iter,
2337 bool needs_lock)
2b188cc1 2338{
9adbd45d
JA
2339 void __user *buf = u64_to_user_ptr(req->rw.addr);
2340 size_t sqe_len = req->rw.len;
4d954c25 2341 ssize_t ret;
edafccee
JA
2342 u8 opcode;
2343
d625c6ee 2344 opcode = req->opcode;
7d009165 2345 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 2346 *iovec = NULL;
9adbd45d 2347 return io_import_fixed(req, rw, iter);
edafccee 2348 }
2b188cc1 2349
bcda7baa
JA
2350 /* buffer index only valid with fixed read/write, or buffer select */
2351 if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
9adbd45d
JA
2352 return -EINVAL;
2353
3a6820f2 2354 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 2355 if (req->flags & REQ_F_BUFFER_SELECT) {
4d954c25
JA
2356 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2357 if (IS_ERR(buf)) {
bcda7baa 2358 *iovec = NULL;
4d954c25 2359 return PTR_ERR(buf);
bcda7baa 2360 }
bcda7baa
JA
2361 }
2362
3a6820f2
JA
2363 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2364 *iovec = NULL;
3a901598 2365 return ret < 0 ? ret : sqe_len;
3a6820f2
JA
2366 }
2367
f67676d1
JA
2368 if (req->io) {
2369 struct io_async_rw *iorw = &req->io->rw;
2370
2371 *iovec = iorw->iov;
2372 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2373 if (iorw->iov == iorw->fast_iov)
2374 *iovec = NULL;
2375 return iorw->size;
2376 }
2377
4d954c25
JA
2378 if (req->flags & REQ_F_BUFFER_SELECT) {
2379 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2380 if (!ret)
2381 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
2382 *iovec = NULL;
2383 return ret;
2384 }
2385
2b188cc1 2386#ifdef CONFIG_COMPAT
cf6fd4bd 2387 if (req->ctx->compat)
2b188cc1
JA
2388 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2389 iovec, iter);
2390#endif
2391
2392 return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2393}
2394
31b51510 2395/*
32960613
JA
2396 * For files that don't have ->read_iter() and ->write_iter(), handle them
2397 * by looping over ->read() or ->write() manually.
31b51510 2398 */
32960613
JA
2399static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2400 struct iov_iter *iter)
2401{
2402 ssize_t ret = 0;
2403
2404 /*
2405 * Don't support polled IO through this interface, and we can't
2406 * support non-blocking either. For the latter, this just causes
2407 * the kiocb to be handled from an async context.
2408 */
2409 if (kiocb->ki_flags & IOCB_HIPRI)
2410 return -EOPNOTSUPP;
2411 if (kiocb->ki_flags & IOCB_NOWAIT)
2412 return -EAGAIN;
2413
2414 while (iov_iter_count(iter)) {
311ae9e1 2415 struct iovec iovec;
32960613
JA
2416 ssize_t nr;
2417
311ae9e1
PB
2418 if (!iov_iter_is_bvec(iter)) {
2419 iovec = iov_iter_iovec(iter);
2420 } else {
2421 /* fixed buffers import bvec */
2422 iovec.iov_base = kmap(iter->bvec->bv_page)
2423 + iter->iov_offset;
2424 iovec.iov_len = min(iter->count,
2425 iter->bvec->bv_len - iter->iov_offset);
2426 }
2427
32960613
JA
2428 if (rw == READ) {
2429 nr = file->f_op->read(file, iovec.iov_base,
2430 iovec.iov_len, &kiocb->ki_pos);
2431 } else {
2432 nr = file->f_op->write(file, iovec.iov_base,
2433 iovec.iov_len, &kiocb->ki_pos);
2434 }
2435
311ae9e1
PB
2436 if (iov_iter_is_bvec(iter))
2437 kunmap(iter->bvec->bv_page);
2438
32960613
JA
2439 if (nr < 0) {
2440 if (!ret)
2441 ret = nr;
2442 break;
2443 }
2444 ret += nr;
2445 if (nr != iovec.iov_len)
2446 break;
2447 iov_iter_advance(iter, nr);
2448 }
2449
2450 return ret;
2451}
2452
b7bb4f7d 2453static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
f67676d1
JA
2454 struct iovec *iovec, struct iovec *fast_iov,
2455 struct iov_iter *iter)
2456{
2457 req->io->rw.nr_segs = iter->nr_segs;
2458 req->io->rw.size = io_size;
2459 req->io->rw.iov = iovec;
2460 if (!req->io->rw.iov) {
2461 req->io->rw.iov = req->io->rw.fast_iov;
2462 memcpy(req->io->rw.iov, fast_iov,
2463 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
2464 } else {
2465 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
2466 }
2467}
2468
b7bb4f7d 2469static int io_alloc_async_ctx(struct io_kiocb *req)
f67676d1 2470{
d3656344
JA
2471 if (!io_op_defs[req->opcode].async_ctx)
2472 return 0;
f67676d1 2473 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
06b76d44 2474 return req->io == NULL;
b7bb4f7d
JA
2475}
2476
b7bb4f7d
JA
2477static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2478 struct iovec *iovec, struct iovec *fast_iov,
2479 struct iov_iter *iter)
2480{
980ad263 2481 if (!io_op_defs[req->opcode].async_ctx)
74566df3 2482 return 0;
5d204bcf
JA
2483 if (!req->io) {
2484 if (io_alloc_async_ctx(req))
2485 return -ENOMEM;
b7bb4f7d 2486
5d204bcf
JA
2487 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2488 }
b7bb4f7d 2489 return 0;
f67676d1
JA
2490}
2491
3529d8c2
JA
2492static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2493 bool force_nonblock)
f67676d1 2494{
3529d8c2
JA
2495 struct io_async_ctx *io;
2496 struct iov_iter iter;
f67676d1
JA
2497 ssize_t ret;
2498
3529d8c2
JA
2499 ret = io_prep_rw(req, sqe, force_nonblock);
2500 if (ret)
2501 return ret;
f67676d1 2502
3529d8c2
JA
2503 if (unlikely(!(req->file->f_mode & FMODE_READ)))
2504 return -EBADF;
f67676d1 2505
5f798bea
PB
2506 /* either don't need iovec imported or already have it */
2507 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3529d8c2
JA
2508 return 0;
2509
2510 io = req->io;
2511 io->rw.iov = io->rw.fast_iov;
2512 req->io = NULL;
bcda7baa 2513 ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
3529d8c2
JA
2514 req->io = io;
2515 if (ret < 0)
2516 return ret;
2517
2518 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2519 return 0;
f67676d1
JA
2520}
2521
014db007 2522static int io_read(struct io_kiocb *req, bool force_nonblock)
2b188cc1
JA
2523{
2524 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2525 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2526 struct iov_iter iter;
31b51510 2527 size_t iov_count;
f67676d1 2528 ssize_t io_size, ret;
2b188cc1 2529
bcda7baa 2530 ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
06b76d44
JA
2531 if (ret < 0)
2532 return ret;
2b188cc1 2533
fd6c2e4c
JA
2534 /* Ensure we clear previously set non-block flag */
2535 if (!force_nonblock)
29de5f6a 2536 kiocb->ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2537
797f3f53 2538 req->result = 0;
f67676d1 2539 io_size = ret;
9e645e11 2540 if (req->flags & REQ_F_LINK)
f67676d1
JA
2541 req->result = io_size;
2542
2543 /*
2544 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2545 * we know to async punt it even if it was opened O_NONBLOCK
2546 */
29de5f6a 2547 if (force_nonblock && !io_file_supports_async(req->file))
f67676d1 2548 goto copy_iov;
9e645e11 2549
31b51510 2550 iov_count = iov_iter_count(&iter);
9adbd45d 2551 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2b188cc1
JA
2552 if (!ret) {
2553 ssize_t ret2;
2554
9adbd45d
JA
2555 if (req->file->f_op->read_iter)
2556 ret2 = call_read_iter(req->file, kiocb, &iter);
32960613 2557 else
9adbd45d 2558 ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
32960613 2559
9d93a3f5 2560 /* Catch -EAGAIN return for forced non-blocking submission */
f67676d1 2561 if (!force_nonblock || ret2 != -EAGAIN) {
014db007 2562 kiocb_done(kiocb, ret2);
f67676d1
JA
2563 } else {
2564copy_iov:
b7bb4f7d 2565 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2566 inline_vecs, &iter);
2567 if (ret)
2568 goto out_free;
29de5f6a
JA
 2569			/* any defer here is final, we must do a blocking retry */
2570 if (!(req->flags & REQ_F_NOWAIT))
2571 req->flags |= REQ_F_MUST_PUNT;
f67676d1
JA
2572 return -EAGAIN;
2573 }
2b188cc1 2574 }
f67676d1 2575out_free:
1e95081c 2576 kfree(iovec);
99bc4c38 2577 req->flags &= ~REQ_F_NEED_CLEANUP;
2b188cc1
JA
2578 return ret;
2579}
2580
3529d8c2
JA
2581static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2582 bool force_nonblock)
f67676d1 2583{
3529d8c2
JA
2584 struct io_async_ctx *io;
2585 struct iov_iter iter;
f67676d1
JA
2586 ssize_t ret;
2587
3529d8c2
JA
2588 ret = io_prep_rw(req, sqe, force_nonblock);
2589 if (ret)
2590 return ret;
f67676d1 2591
3529d8c2
JA
2592 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2593 return -EBADF;
f67676d1 2594
5f798bea
PB
2595 /* either don't need iovec imported or already have it */
2596 if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3529d8c2
JA
2597 return 0;
2598
2599 io = req->io;
2600 io->rw.iov = io->rw.fast_iov;
2601 req->io = NULL;
bcda7baa 2602 ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
3529d8c2
JA
2603 req->io = io;
2604 if (ret < 0)
2605 return ret;
2606
2607 io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2608 return 0;
f67676d1
JA
2609}
2610
014db007 2611static int io_write(struct io_kiocb *req, bool force_nonblock)
2b188cc1
JA
2612{
2613 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 2614 struct kiocb *kiocb = &req->rw.kiocb;
2b188cc1 2615 struct iov_iter iter;
31b51510 2616 size_t iov_count;
f67676d1 2617 ssize_t ret, io_size;
2b188cc1 2618
bcda7baa 2619 ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
06b76d44
JA
2620 if (ret < 0)
2621 return ret;
2b188cc1 2622
fd6c2e4c
JA
2623 /* Ensure we clear previously set non-block flag */
2624 if (!force_nonblock)
9adbd45d 2625 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
fd6c2e4c 2626
797f3f53 2627 req->result = 0;
f67676d1 2628 io_size = ret;
9e645e11 2629 if (req->flags & REQ_F_LINK)
f67676d1 2630 req->result = io_size;
9e645e11 2631
f67676d1
JA
2632 /*
2633 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2634 * we know to async punt it even if it was opened O_NONBLOCK
2635 */
29de5f6a 2636 if (force_nonblock && !io_file_supports_async(req->file))
f67676d1 2637 goto copy_iov;
31b51510 2638
10d59345
JA
 2639	/* file path doesn't support NOWAIT for buffered (non-direct) IO */
2640 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2641 (req->flags & REQ_F_ISREG))
f67676d1 2642 goto copy_iov;
31b51510 2643
f67676d1 2644 iov_count = iov_iter_count(&iter);
9adbd45d 2645 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2b188cc1 2646 if (!ret) {
9bf7933f
RP
2647 ssize_t ret2;
2648
2b188cc1
JA
2649 /*
2650 * Open-code file_start_write here to grab freeze protection,
2651 * which will be released by another thread in
2652 * io_complete_rw(). Fool lockdep by telling it the lock got
2653 * released so that it doesn't complain about the held lock when
2654 * we return to userspace.
2655 */
491381ce 2656 if (req->flags & REQ_F_ISREG) {
9adbd45d 2657 __sb_start_write(file_inode(req->file)->i_sb,
2b188cc1 2658 SB_FREEZE_WRITE, true);
9adbd45d 2659 __sb_writers_release(file_inode(req->file)->i_sb,
2b188cc1
JA
2660 SB_FREEZE_WRITE);
2661 }
2662 kiocb->ki_flags |= IOCB_WRITE;
9bf7933f 2663
9adbd45d
JA
2664 if (req->file->f_op->write_iter)
2665 ret2 = call_write_iter(req->file, kiocb, &iter);
32960613 2666 else
9adbd45d 2667 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
faac996c
JA
2668 /*
 2669		 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
2670 * retry them without IOCB_NOWAIT.
2671 */
2672 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2673 ret2 = -EAGAIN;
f67676d1 2674 if (!force_nonblock || ret2 != -EAGAIN) {
014db007 2675 kiocb_done(kiocb, ret2);
f67676d1
JA
2676 } else {
2677copy_iov:
b7bb4f7d 2678 ret = io_setup_async_rw(req, io_size, iovec,
f67676d1
JA
2679 inline_vecs, &iter);
2680 if (ret)
2681 goto out_free;
29de5f6a
JA
 2682			/* any defer here is final, we must do a blocking retry */
2683 req->flags |= REQ_F_MUST_PUNT;
f67676d1
JA
2684 return -EAGAIN;
2685 }
2b188cc1 2686 }
31b51510 2687out_free:
99bc4c38 2688 req->flags &= ~REQ_F_NEED_CLEANUP;
1e95081c 2689 kfree(iovec);
2b188cc1
JA
2690 return ret;
2691}
2692
7d67af2c
PB
2693static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2694{
 2695	struct io_splice *sp = &req->splice;
2696 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2697 int ret;
2698
2699 if (req->flags & REQ_F_NEED_CLEANUP)
2700 return 0;
2701
2702 sp->file_in = NULL;
2703 sp->off_in = READ_ONCE(sqe->splice_off_in);
2704 sp->off_out = READ_ONCE(sqe->off);
2705 sp->len = READ_ONCE(sqe->len);
2706 sp->flags = READ_ONCE(sqe->splice_flags);
2707
2708 if (unlikely(sp->flags & ~valid_flags))
2709 return -EINVAL;
2710
2711 ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2712 (sp->flags & SPLICE_F_FD_IN_FIXED));
2713 if (ret)
2714 return ret;
2715 req->flags |= REQ_F_NEED_CLEANUP;
2716
2717 if (!S_ISREG(file_inode(sp->file_in)->i_mode))
2718 req->work.flags |= IO_WQ_WORK_UNBOUND;
2719
2720 return 0;
2721}
2722
2723static bool io_splice_punt(struct file *file)
2724{
2725 if (get_pipe_info(file))
2726 return false;
2727 if (!io_file_supports_async(file))
2728 return true;
 2729	return !(file->f_flags & O_NONBLOCK);
2730}
2731
014db007 2732static int io_splice(struct io_kiocb *req, bool force_nonblock)
7d67af2c
PB
2733{
2734 struct io_splice *sp = &req->splice;
2735 struct file *in = sp->file_in;
2736 struct file *out = sp->file_out;
2737 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2738 loff_t *poff_in, *poff_out;
2739 long ret;
2740
2741 if (force_nonblock) {
2742 if (io_splice_punt(in) || io_splice_punt(out))
2743 return -EAGAIN;
2744 flags |= SPLICE_F_NONBLOCK;
2745 }
2746
2747 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2748 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2749 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2750 if (force_nonblock && ret == -EAGAIN)
2751 return -EAGAIN;
2752
2753 io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2754 req->flags &= ~REQ_F_NEED_CLEANUP;
2755
2756 io_cqring_add_event(req, ret);
2757 if (ret != sp->len)
2758 req_set_fail_links(req);
014db007 2759 io_put_req(req);
7d67af2c
PB
2760 return 0;
2761}
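
/*
 * Illustrative userspace sketch, not part of this file (assumes the
 * liburing io_uring_prep_splice() helper; fd names are made up): move
 * 'len' bytes from a pipe into a regular file without a userspace copy.
 * An offset of -1 means "no offset", matching the sp->off_in/off_out
 * handling above.
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_splice(sqe, pipe_rd_fd, -1, file_fd, 0, len, 0);
 *	io_uring_submit(&ring);
 */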
2762
2b188cc1
JA
2763/*
2764 * IORING_OP_NOP just posts a completion event, nothing else.
2765 */
78e19bbe 2766static int io_nop(struct io_kiocb *req)
2b188cc1
JA
2767{
2768 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2769
def596e9
JA
2770 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2771 return -EINVAL;
2772
78e19bbe 2773 io_cqring_add_event(req, 0);
e65ef56d 2774 io_put_req(req);
2b188cc1
JA
2775 return 0;
2776}
2777
3529d8c2 2778static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 2779{
6b06314c 2780 struct io_ring_ctx *ctx = req->ctx;
c992fe29 2781
09bb8394
JA
2782 if (!req->file)
2783 return -EBADF;
c992fe29 2784
6b06314c 2785 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 2786 return -EINVAL;
edafccee 2787 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
c992fe29
CH
2788 return -EINVAL;
2789
8ed8d3c3
JA
2790 req->sync.flags = READ_ONCE(sqe->fsync_flags);
2791 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2792 return -EINVAL;
2793
2794 req->sync.off = READ_ONCE(sqe->off);
2795 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
2796 return 0;
2797}
2798
8ed8d3c3
JA
2799static bool io_req_cancelled(struct io_kiocb *req)
2800{
2801 if (req->work.flags & IO_WQ_WORK_CANCEL) {
2802 req_set_fail_links(req);
2803 io_cqring_add_event(req, -ECANCELED);
e9fd9396 2804 io_put_req(req);
8ed8d3c3
JA
2805 return true;
2806 }
2807
2808 return false;
2809}
2810
014db007 2811static void __io_fsync(struct io_kiocb *req)
8ed8d3c3 2812{
8ed8d3c3 2813 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
2814 int ret;
2815
9adbd45d 2816 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
2817 end > 0 ? end : LLONG_MAX,
2818 req->sync.flags & IORING_FSYNC_DATASYNC);
2819 if (ret < 0)
2820 req_set_fail_links(req);
2821 io_cqring_add_event(req, ret);
014db007 2822 io_put_req(req);
5ea62161
PB
2823}
2824
2825static void io_fsync_finish(struct io_wq_work **workptr)
2826{
2827 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
5ea62161
PB
2828
2829 if (io_req_cancelled(req))
2830 return;
014db007 2831 __io_fsync(req);
e9fd9396 2832 io_steal_work(req, workptr);
8ed8d3c3
JA
2833}
2834
014db007 2835static int io_fsync(struct io_kiocb *req, bool force_nonblock)
c992fe29 2836{
c992fe29 2837 /* fsync always requires a blocking context */
8ed8d3c3 2838 if (force_nonblock) {
8ed8d3c3 2839 req->work.func = io_fsync_finish;
c992fe29 2840 return -EAGAIN;
8ed8d3c3 2841 }
014db007 2842 __io_fsync(req);
c992fe29
CH
2843 return 0;
2844}
2845
014db007 2846static void __io_fallocate(struct io_kiocb *req)
8ed8d3c3 2847{
8ed8d3c3
JA
2848 int ret;
2849
d63d1b5e
JA
2850 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2851 req->sync.len);
8ed8d3c3
JA
2852 if (ret < 0)
2853 req_set_fail_links(req);
2854 io_cqring_add_event(req, ret);
014db007 2855 io_put_req(req);
5ea62161
PB
2856}
2857
2858static void io_fallocate_finish(struct io_wq_work **workptr)
2859{
2860 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
5ea62161 2861
594506fe
PB
2862 if (io_req_cancelled(req))
2863 return;
014db007 2864 __io_fallocate(req);
e9fd9396 2865 io_steal_work(req, workptr);
5d17b4a4
JA
2866}
2867
d63d1b5e
JA
2868static int io_fallocate_prep(struct io_kiocb *req,
2869 const struct io_uring_sqe *sqe)
2870{
2871 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2872 return -EINVAL;
2873
2874 req->sync.off = READ_ONCE(sqe->off);
2875 req->sync.len = READ_ONCE(sqe->addr);
2876 req->sync.mode = READ_ONCE(sqe->len);
2877 return 0;
2878}
2879
014db007 2880static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
5d17b4a4 2881{
d63d1b5e 2882	/* fallocate always requires a blocking context */
8ed8d3c3 2883 if (force_nonblock) {
d63d1b5e 2884 req->work.func = io_fallocate_finish;
5d17b4a4 2885 return -EAGAIN;
8ed8d3c3 2886 }
5d17b4a4 2887
014db007 2888 __io_fallocate(req);
5d17b4a4
JA
2889 return 0;
2890}
2891
15b71abe 2892static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 2893{
f8748881 2894 const char __user *fname;
15b71abe 2895 int ret;
b7bb4f7d 2896
15b71abe
JA
2897 if (sqe->ioprio || sqe->buf_index)
2898 return -EINVAL;
cf3040ca
JA
2899 if (sqe->flags & IOSQE_FIXED_FILE)
2900 return -EBADF;
0bdbdd08
PB
2901 if (req->flags & REQ_F_NEED_CLEANUP)
2902 return 0;
03b1230c 2903
15b71abe 2904 req->open.dfd = READ_ONCE(sqe->fd);
c12cedf2 2905 req->open.how.mode = READ_ONCE(sqe->len);
f8748881 2906 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
c12cedf2 2907 req->open.how.flags = READ_ONCE(sqe->open_flags);
3529d8c2 2908
f8748881 2909 req->open.filename = getname(fname);
15b71abe
JA
2910 if (IS_ERR(req->open.filename)) {
2911 ret = PTR_ERR(req->open.filename);
2912 req->open.filename = NULL;
2913 return ret;
2914 }
3529d8c2 2915
8fef80bf 2916 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 2917 return 0;
03b1230c
JA
2918}
2919
cebdb986 2920static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 2921{
cebdb986
JA
2922 struct open_how __user *how;
2923 const char __user *fname;
2924 size_t len;
0fa03c62
JA
2925 int ret;
2926
cebdb986 2927 if (sqe->ioprio || sqe->buf_index)
0fa03c62 2928 return -EINVAL;
cf3040ca
JA
2929 if (sqe->flags & IOSQE_FIXED_FILE)
2930 return -EBADF;
0bdbdd08
PB
2931 if (req->flags & REQ_F_NEED_CLEANUP)
2932 return 0;
0fa03c62 2933
cebdb986
JA
2934 req->open.dfd = READ_ONCE(sqe->fd);
2935 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2936 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2937 len = READ_ONCE(sqe->len);
0fa03c62 2938
cebdb986
JA
2939 if (len < OPEN_HOW_SIZE_VER0)
2940 return -EINVAL;
3529d8c2 2941
cebdb986
JA
2942 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2943 len);
2944 if (ret)
2945 return ret;
3529d8c2 2946
cebdb986
JA
2947 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2948 req->open.how.flags |= O_LARGEFILE;
0fa03c62 2949
cebdb986
JA
2950 req->open.filename = getname(fname);
2951 if (IS_ERR(req->open.filename)) {
2952 ret = PTR_ERR(req->open.filename);
2953 req->open.filename = NULL;
2954 return ret;
2955 }
2956
8fef80bf 2957 req->flags |= REQ_F_NEED_CLEANUP;
cebdb986
JA
2958 return 0;
2959}
2960
014db007 2961static int io_openat2(struct io_kiocb *req, bool force_nonblock)
15b71abe
JA
2962{
2963 struct open_flags op;
15b71abe
JA
2964 struct file *file;
2965 int ret;
2966
f86cd20c 2967 if (force_nonblock)
15b71abe 2968 return -EAGAIN;
15b71abe 2969
cebdb986 2970 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
2971 if (ret)
2972 goto err;
2973
cebdb986 2974 ret = get_unused_fd_flags(req->open.how.flags);
15b71abe
JA
2975 if (ret < 0)
2976 goto err;
2977
2978 file = do_filp_open(req->open.dfd, req->open.filename, &op);
2979 if (IS_ERR(file)) {
2980 put_unused_fd(ret);
2981 ret = PTR_ERR(file);
2982 } else {
2983 fsnotify_open(file);
2984 fd_install(ret, file);
2985 }
2986err:
2987 putname(req->open.filename);
8fef80bf 2988 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe
JA
2989 if (ret < 0)
2990 req_set_fail_links(req);
2991 io_cqring_add_event(req, ret);
014db007 2992 io_put_req(req);
15b71abe
JA
2993 return 0;
2994}
2995
014db007 2996static int io_openat(struct io_kiocb *req, bool force_nonblock)
cebdb986
JA
2997{
2998 req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
014db007 2999 return io_openat2(req, force_nonblock);
cebdb986
JA
3000}
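
/*
 * Illustrative userspace sketch, not part of this file (assumes the
 * liburing io_uring_prep_openat2() helper; the path is made up): the same
 * struct open_how that openat2(2) takes is what io_openat2_prep() copies
 * in above, and the new descriptor comes back in cqe->res.
 *
 *	struct open_how how = { .flags = O_RDONLY | O_CLOEXEC };
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_openat2(sqe, AT_FDCWD, "data.bin", &how);
 *	io_uring_submit(&ring);
 */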
3001
067524e9
JA
3002static int io_remove_buffers_prep(struct io_kiocb *req,
3003 const struct io_uring_sqe *sqe)
3004{
3005 struct io_provide_buf *p = &req->pbuf;
3006 u64 tmp;
3007
3008 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3009 return -EINVAL;
3010
3011 tmp = READ_ONCE(sqe->fd);
3012 if (!tmp || tmp > USHRT_MAX)
3013 return -EINVAL;
3014
3015 memset(p, 0, sizeof(*p));
3016 p->nbufs = tmp;
3017 p->bgid = READ_ONCE(sqe->buf_group);
3018 return 0;
3019}
3020
3021static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3022 int bgid, unsigned nbufs)
3023{
3024 unsigned i = 0;
3025
3026 /* shouldn't happen */
3027 if (!nbufs)
3028 return 0;
3029
3030 /* the head kbuf is the list itself */
3031 while (!list_empty(&buf->list)) {
3032 struct io_buffer *nxt;
3033
3034 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3035 list_del(&nxt->list);
3036 kfree(nxt);
3037 if (++i == nbufs)
3038 return i;
3039 }
3040 i++;
3041 kfree(buf);
3042 idr_remove(&ctx->io_buffer_idr, bgid);
3043
3044 return i;
3045}
3046
3047static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
3048{
3049 struct io_provide_buf *p = &req->pbuf;
3050 struct io_ring_ctx *ctx = req->ctx;
3051 struct io_buffer *head;
3052 int ret = 0;
3053
3054 io_ring_submit_lock(ctx, !force_nonblock);
3055
3056 lockdep_assert_held(&ctx->uring_lock);
3057
3058 ret = -ENOENT;
3059 head = idr_find(&ctx->io_buffer_idr, p->bgid);
3060 if (head)
3061 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3062
 3063	io_ring_submit_unlock(ctx, !force_nonblock);
3064 if (ret < 0)
3065 req_set_fail_links(req);
3066 io_cqring_add_event(req, ret);
3067 io_put_req(req);
3068 return 0;
3069}
3070
ddf0322d
JA
3071static int io_provide_buffers_prep(struct io_kiocb *req,
3072 const struct io_uring_sqe *sqe)
3073{
3074 struct io_provide_buf *p = &req->pbuf;
3075 u64 tmp;
3076
3077 if (sqe->ioprio || sqe->rw_flags)
3078 return -EINVAL;
3079
3080 tmp = READ_ONCE(sqe->fd);
3081 if (!tmp || tmp > USHRT_MAX)
3082 return -E2BIG;
3083 p->nbufs = tmp;
3084 p->addr = READ_ONCE(sqe->addr);
3085 p->len = READ_ONCE(sqe->len);
3086
3087 if (!access_ok(u64_to_user_ptr(p->addr), p->len))
3088 return -EFAULT;
3089
3090 p->bgid = READ_ONCE(sqe->buf_group);
3091 tmp = READ_ONCE(sqe->off);
3092 if (tmp > USHRT_MAX)
3093 return -E2BIG;
3094 p->bid = tmp;
3095 return 0;
3096}
3097
3098static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3099{
3100 struct io_buffer *buf;
3101 u64 addr = pbuf->addr;
3102 int i, bid = pbuf->bid;
3103
3104 for (i = 0; i < pbuf->nbufs; i++) {
3105 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3106 if (!buf)
3107 break;
3108
3109 buf->addr = addr;
3110 buf->len = pbuf->len;
3111 buf->bid = bid;
3112 addr += pbuf->len;
3113 bid++;
3114 if (!*head) {
3115 INIT_LIST_HEAD(&buf->list);
3116 *head = buf;
3117 } else {
3118 list_add_tail(&buf->list, &(*head)->list);
3119 }
3120 }
3121
3122 return i ? i : -ENOMEM;
3123}
3124
ddf0322d
JA
3125static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
3126{
3127 struct io_provide_buf *p = &req->pbuf;
3128 struct io_ring_ctx *ctx = req->ctx;
3129 struct io_buffer *head, *list;
3130 int ret = 0;
3131
3132 io_ring_submit_lock(ctx, !force_nonblock);
3133
3134 lockdep_assert_held(&ctx->uring_lock);
3135
3136 list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3137
3138 ret = io_add_buffers(p, &head);
3139 if (ret < 0)
3140 goto out;
3141
3142 if (!list) {
3143 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3144 GFP_KERNEL);
3145 if (ret < 0) {
067524e9 3146 __io_remove_buffers(ctx, head, p->bgid, -1U);
ddf0322d
JA
3147 goto out;
3148 }
3149 }
3150out:
3151 io_ring_submit_unlock(ctx, !force_nonblock);
3152 if (ret < 0)
3153 req_set_fail_links(req);
3154 io_cqring_add_event(req, ret);
3155 io_put_req(req);
3156 return 0;
3157}
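
/*
 * Illustrative userspace sketch, not part of this file (liburing helpers
 * assumed; BGID/NR_BUFS/BUF_LEN and 'pool' are made up): hand a group of
 * buffers to the kernel, then let a receive pick one via
 * IOSQE_BUFFER_SELECT. The chosen buffer ID comes back in cqe->flags with
 * IORING_CQE_F_BUFFER set, in the bits above IORING_CQE_BUFFER_SHIFT.
 *
 *	io_uring_prep_provide_buffers(io_uring_get_sqe(&ring), pool, BUF_LEN,
 *				      NR_BUFS, BGID, 0);
 *	io_uring_submit(&ring);
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_recv(sqe, sockfd, NULL, BUF_LEN, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = BGID;
 *	io_uring_submit(&ring);
 */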
3158
3e4827b0
JA
3159static int io_epoll_ctl_prep(struct io_kiocb *req,
3160 const struct io_uring_sqe *sqe)
3161{
3162#if defined(CONFIG_EPOLL)
3163 if (sqe->ioprio || sqe->buf_index)
3164 return -EINVAL;
3165
3166 req->epoll.epfd = READ_ONCE(sqe->fd);
3167 req->epoll.op = READ_ONCE(sqe->len);
3168 req->epoll.fd = READ_ONCE(sqe->off);
3169
3170 if (ep_op_has_event(req->epoll.op)) {
3171 struct epoll_event __user *ev;
3172
3173 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3174 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3175 return -EFAULT;
3176 }
3177
3178 return 0;
3179#else
3180 return -EOPNOTSUPP;
3181#endif
3182}
3183
014db007 3184static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3e4827b0
JA
3185{
3186#if defined(CONFIG_EPOLL)
3187 struct io_epoll *ie = &req->epoll;
3188 int ret;
3189
3190 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3191 if (force_nonblock && ret == -EAGAIN)
3192 return -EAGAIN;
3193
3194 if (ret < 0)
3195 req_set_fail_links(req);
3196 io_cqring_add_event(req, ret);
014db007 3197 io_put_req(req);
3e4827b0
JA
3198 return 0;
3199#else
3200 return -EOPNOTSUPP;
3201#endif
3202}
3203
c1ca757b
JA
3204static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3205{
3206#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3207 if (sqe->ioprio || sqe->buf_index || sqe->off)
3208 return -EINVAL;
3209
3210 req->madvise.addr = READ_ONCE(sqe->addr);
3211 req->madvise.len = READ_ONCE(sqe->len);
3212 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3213 return 0;
3214#else
3215 return -EOPNOTSUPP;
3216#endif
3217}
3218
014db007 3219static int io_madvise(struct io_kiocb *req, bool force_nonblock)
c1ca757b
JA
3220{
3221#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3222 struct io_madvise *ma = &req->madvise;
3223 int ret;
3224
3225 if (force_nonblock)
3226 return -EAGAIN;
3227
3228 ret = do_madvise(ma->addr, ma->len, ma->advice);
3229 if (ret < 0)
3230 req_set_fail_links(req);
3231 io_cqring_add_event(req, ret);
014db007 3232 io_put_req(req);
c1ca757b
JA
3233 return 0;
3234#else
3235 return -EOPNOTSUPP;
3236#endif
3237}
3238
4840e418
JA
3239static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3240{
3241 if (sqe->ioprio || sqe->buf_index || sqe->addr)
3242 return -EINVAL;
3243
3244 req->fadvise.offset = READ_ONCE(sqe->off);
3245 req->fadvise.len = READ_ONCE(sqe->len);
3246 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3247 return 0;
3248}
3249
014db007 3250static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
4840e418
JA
3251{
3252 struct io_fadvise *fa = &req->fadvise;
3253 int ret;
3254
3e69426d
JA
3255 if (force_nonblock) {
3256 switch (fa->advice) {
3257 case POSIX_FADV_NORMAL:
3258 case POSIX_FADV_RANDOM:
3259 case POSIX_FADV_SEQUENTIAL:
3260 break;
3261 default:
3262 return -EAGAIN;
3263 }
3264 }
4840e418
JA
3265
3266 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3267 if (ret < 0)
3268 req_set_fail_links(req);
3269 io_cqring_add_event(req, ret);
014db007 3270 io_put_req(req);
4840e418
JA
3271 return 0;
3272}
3273
eddc7ef5
JA
3274static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3275{
f8748881 3276 const char __user *fname;
eddc7ef5
JA
3277 unsigned lookup_flags;
3278 int ret;
3279
3280 if (sqe->ioprio || sqe->buf_index)
3281 return -EINVAL;
cf3040ca
JA
3282 if (sqe->flags & IOSQE_FIXED_FILE)
3283 return -EBADF;
0bdbdd08
PB
3284 if (req->flags & REQ_F_NEED_CLEANUP)
3285 return 0;
eddc7ef5
JA
3286
3287 req->open.dfd = READ_ONCE(sqe->fd);
3288 req->open.mask = READ_ONCE(sqe->len);
f8748881 3289 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
eddc7ef5 3290 req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
c12cedf2 3291 req->open.how.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 3292
c12cedf2 3293 if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
eddc7ef5
JA
3294 return -EINVAL;
3295
f8748881 3296 req->open.filename = getname_flags(fname, lookup_flags, NULL);
eddc7ef5
JA
3297 if (IS_ERR(req->open.filename)) {
3298 ret = PTR_ERR(req->open.filename);
3299 req->open.filename = NULL;
3300 return ret;
3301 }
3302
8fef80bf 3303 req->flags |= REQ_F_NEED_CLEANUP;
eddc7ef5
JA
3304 return 0;
3305}
3306
014db007 3307static int io_statx(struct io_kiocb *req, bool force_nonblock)
eddc7ef5
JA
3308{
3309 struct io_open *ctx = &req->open;
3310 unsigned lookup_flags;
3311 struct path path;
3312 struct kstat stat;
3313 int ret;
3314
3315 if (force_nonblock)
3316 return -EAGAIN;
3317
c12cedf2 3318 if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
eddc7ef5
JA
3319 return -EINVAL;
3320
3321retry:
3322 /* filename_lookup() drops it, keep a reference */
3323 ctx->filename->refcnt++;
3324
3325 ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
3326 NULL);
3327 if (ret)
3328 goto err;
3329
c12cedf2 3330 ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
eddc7ef5
JA
3331 path_put(&path);
3332 if (retry_estale(ret, lookup_flags)) {
3333 lookup_flags |= LOOKUP_REVAL;
3334 goto retry;
3335 }
3336 if (!ret)
3337 ret = cp_statx(&stat, ctx->buffer);
3338err:
3339 putname(ctx->filename);
8fef80bf 3340 req->flags &= ~REQ_F_NEED_CLEANUP;
eddc7ef5
JA
3341 if (ret < 0)
3342 req_set_fail_links(req);
3343 io_cqring_add_event(req, ret);
014db007 3344 io_put_req(req);
eddc7ef5
JA
3345 return 0;
3346}
3347
b5dba59e
JA
3348static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3349{
3350 /*
3351 * If we queue this for async, it must not be cancellable. That would
 3352	 * leave the 'file' in an indeterminate state.
3353 */
3354 req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3355
3356 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3357 sqe->rw_flags || sqe->buf_index)
3358 return -EINVAL;
3359 if (sqe->flags & IOSQE_FIXED_FILE)
cf3040ca 3360 return -EBADF;
b5dba59e
JA
3361
3362 req->close.fd = READ_ONCE(sqe->fd);
3363 if (req->file->f_op == &io_uring_fops ||
b14cca0c 3364 req->close.fd == req->ctx->ring_fd)
b5dba59e
JA
3365 return -EBADF;
3366
3367 return 0;
3368}
3369
a93b3331 3370/* only called when __close_fd_get_file() is done */
014db007 3371static void __io_close_finish(struct io_kiocb *req)
a93b3331
PB
3372{
3373 int ret;
3374
3375 ret = filp_close(req->close.put_file, req->work.files);
3376 if (ret < 0)
3377 req_set_fail_links(req);
3378 io_cqring_add_event(req, ret);
3379 fput(req->close.put_file);
014db007 3380 io_put_req(req);
a93b3331
PB
3381}
3382
b5dba59e
JA
3383static void io_close_finish(struct io_wq_work **workptr)
3384{
3385 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
b5dba59e 3386
7fbeb95d 3387 /* not cancellable, don't do io_req_cancelled() */
014db007 3388 __io_close_finish(req);
e9fd9396 3389 io_steal_work(req, workptr);
b5dba59e
JA
3390}
3391
014db007 3392static int io_close(struct io_kiocb *req, bool force_nonblock)
b5dba59e
JA
3393{
3394 int ret;
3395
3396 req->close.put_file = NULL;
3397 ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
3398 if (ret < 0)
3399 return ret;
3400
3401 /* if the file has a flush method, be safe and punt to async */
a2100672 3402 if (req->close.put_file->f_op->flush && force_nonblock) {
594506fe
PB
3403 /* submission ref will be dropped, take it for async */
3404 refcount_inc(&req->refs);
3405
a2100672
PB
3406 req->work.func = io_close_finish;
3407 /*
3408 * Do manual async queue here to avoid grabbing files - we don't
3409 * need the files, and it'll cause io_close_finish() to close
3410 * the file again and cause a double CQE entry for this request
3411 */
3412 io_queue_async_work(req);
3413 return 0;
3414 }
b5dba59e
JA
3415
3416 /*
3417 * No ->flush(), safely close from here and just punt the
3418 * fput() to async context.
3419 */
014db007 3420 __io_close_finish(req);
a93b3331 3421 return 0;
b5dba59e
JA
3422}
3423
3529d8c2 3424static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
3425{
3426 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4
JA
3427
3428 if (!req->file)
3429 return -EBADF;
5d17b4a4
JA
3430
3431 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3432 return -EINVAL;
3433 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3434 return -EINVAL;
3435
8ed8d3c3
JA
3436 req->sync.off = READ_ONCE(sqe->off);
3437 req->sync.len = READ_ONCE(sqe->len);
3438 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
3439 return 0;
3440}
3441
014db007 3442static void __io_sync_file_range(struct io_kiocb *req)
8ed8d3c3 3443{
8ed8d3c3
JA
3444 int ret;
3445
9adbd45d 3446 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
3447 req->sync.flags);
3448 if (ret < 0)
3449 req_set_fail_links(req);
3450 io_cqring_add_event(req, ret);
014db007 3451 io_put_req(req);
5ea62161
PB
3452}
3453
3454
3455static void io_sync_file_range_finish(struct io_wq_work **workptr)
3456{
3457 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3458 struct io_kiocb *nxt = NULL;
3459
3460 if (io_req_cancelled(req))
3461 return;
014db007 3462 __io_sync_file_range(req);
594506fe 3463 io_put_req(req); /* put submission ref */
8ed8d3c3 3464 if (nxt)
78912934 3465 io_wq_assign_next(workptr, nxt);
5d17b4a4
JA
3466}
3467
014db007 3468static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
5d17b4a4 3469{
5d17b4a4 3470 /* sync_file_range always requires a blocking context */
8ed8d3c3 3471 if (force_nonblock) {
8ed8d3c3 3472 req->work.func = io_sync_file_range_finish;
5d17b4a4 3473 return -EAGAIN;
8ed8d3c3 3474 }
5d17b4a4 3475
014db007 3476 __io_sync_file_range(req);
5d17b4a4
JA
3477 return 0;
3478}
3479
469956e8 3480#if defined(CONFIG_NET)
02d27d89
PB
3481static int io_setup_async_msg(struct io_kiocb *req,
3482 struct io_async_msghdr *kmsg)
3483{
3484 if (req->io)
3485 return -EAGAIN;
3486 if (io_alloc_async_ctx(req)) {
3487 if (kmsg->iov != kmsg->fast_iov)
3488 kfree(kmsg->iov);
3489 return -ENOMEM;
3490 }
3491 req->flags |= REQ_F_NEED_CLEANUP;
3492 memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3493 return -EAGAIN;
3494}
3495
3529d8c2 3496static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 3497{
e47293fd 3498 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 3499 struct io_async_ctx *io = req->io;
99bc4c38 3500 int ret;
03b1230c 3501
e47293fd
JA
3502 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3503 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 3504 sr->len = READ_ONCE(sqe->len);
3529d8c2 3505
d8768362
JA
3506#ifdef CONFIG_COMPAT
3507 if (req->ctx->compat)
3508 sr->msg_flags |= MSG_CMSG_COMPAT;
3509#endif
3510
fddaface 3511 if (!io || req->opcode == IORING_OP_SEND)
3529d8c2 3512 return 0;
5f798bea
PB
3513 /* iovec is already imported */
3514 if (req->flags & REQ_F_NEED_CLEANUP)
3515 return 0;
3529d8c2 3516
d9688565 3517 io->msg.iov = io->msg.fast_iov;
99bc4c38 3518 ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
e47293fd 3519 &io->msg.iov);
99bc4c38
PB
3520 if (!ret)
3521 req->flags |= REQ_F_NEED_CLEANUP;
3522 return ret;
03b1230c
JA
3523}
3524
014db007 3525static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
aa1fa28f 3526{
0b416c3e 3527 struct io_async_msghdr *kmsg = NULL;
0fa03c62
JA
3528 struct socket *sock;
3529 int ret;
3530
3531 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3532 return -EINVAL;
3533
3534 sock = sock_from_file(req->file, &ret);
3535 if (sock) {
b7bb4f7d 3536 struct io_async_ctx io;
0fa03c62
JA
3537 unsigned flags;
3538
03b1230c 3539 if (req->io) {
0b416c3e 3540 kmsg = &req->io->msg;
b537916c 3541 kmsg->msg.msg_name = &req->io->msg.addr;
0b416c3e
JA
3542 /* if iov is set, it's allocated already */
3543 if (!kmsg->iov)
3544 kmsg->iov = kmsg->fast_iov;
3545 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3546 } else {
3529d8c2
JA
3547 struct io_sr_msg *sr = &req->sr_msg;
3548
0b416c3e 3549 kmsg = &io.msg;
b537916c 3550 kmsg->msg.msg_name = &io.msg.addr;
3529d8c2
JA
3551
3552 io.msg.iov = io.msg.fast_iov;
3553 ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3554 sr->msg_flags, &io.msg.iov);
03b1230c 3555 if (ret)
3529d8c2 3556 return ret;
03b1230c 3557 }
0fa03c62 3558
e47293fd
JA
3559 flags = req->sr_msg.msg_flags;
3560 if (flags & MSG_DONTWAIT)
3561 req->flags |= REQ_F_NOWAIT;
3562 else if (force_nonblock)
3563 flags |= MSG_DONTWAIT;
3564
0b416c3e 3565 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
02d27d89
PB
3566 if (force_nonblock && ret == -EAGAIN)
3567 return io_setup_async_msg(req, kmsg);
441cdbd5
JA
3568 if (ret == -ERESTARTSYS)
3569 ret = -EINTR;
0fa03c62
JA
3570 }
3571
1e95081c 3572 if (kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3573 kfree(kmsg->iov);
99bc4c38 3574 req->flags &= ~REQ_F_NEED_CLEANUP;
78e19bbe 3575 io_cqring_add_event(req, ret);
4e88d6e7
JA
3576 if (ret < 0)
3577 req_set_fail_links(req);
014db007 3578 io_put_req(req);
5d17b4a4 3579 return 0;
03b1230c 3580}
aa1fa28f 3581
014db007 3582static int io_send(struct io_kiocb *req, bool force_nonblock)
fddaface 3583{
fddaface
JA
3584 struct socket *sock;
3585 int ret;
3586
3587 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3588 return -EINVAL;
3589
3590 sock = sock_from_file(req->file, &ret);
3591 if (sock) {
3592 struct io_sr_msg *sr = &req->sr_msg;
3593 struct msghdr msg;
3594 struct iovec iov;
3595 unsigned flags;
3596
3597 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3598 &msg.msg_iter);
3599 if (ret)
3600 return ret;
3601
3602 msg.msg_name = NULL;
3603 msg.msg_control = NULL;
3604 msg.msg_controllen = 0;
3605 msg.msg_namelen = 0;
3606
3607 flags = req->sr_msg.msg_flags;
3608 if (flags & MSG_DONTWAIT)
3609 req->flags |= REQ_F_NOWAIT;
3610 else if (force_nonblock)
3611 flags |= MSG_DONTWAIT;
3612
0b7b21e4
JA
3613 msg.msg_flags = flags;
3614 ret = sock_sendmsg(sock, &msg);
fddaface
JA
3615 if (force_nonblock && ret == -EAGAIN)
3616 return -EAGAIN;
3617 if (ret == -ERESTARTSYS)
3618 ret = -EINTR;
3619 }
3620
3621 io_cqring_add_event(req, ret);
3622 if (ret < 0)
3623 req_set_fail_links(req);
014db007 3624 io_put_req(req);
fddaface 3625 return 0;
fddaface
JA
3626}
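
/*
 * Illustrative userspace sketch, not part of this file (liburing helper
 * assumed): IORING_OP_SEND is the msghdr-less variant handled above - a
 * single buffer plus flags - while IORING_OP_SENDMSG goes through
 * io_sendmsg() with a full struct msghdr.
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_send(sqe, sockfd, buf, buf_len, MSG_NOSIGNAL);
 *	io_uring_submit(&ring);
 */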
3627
52de1fe1
JA
3628static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3629{
3630 struct io_sr_msg *sr = &req->sr_msg;
3631 struct iovec __user *uiov;
3632 size_t iov_len;
3633 int ret;
3634
3635 ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3636 &uiov, &iov_len);
3637 if (ret)
3638 return ret;
3639
3640 if (req->flags & REQ_F_BUFFER_SELECT) {
3641 if (iov_len > 1)
3642 return -EINVAL;
3643 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3644 return -EFAULT;
3645 sr->len = io->msg.iov[0].iov_len;
3646 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3647 sr->len);
3648 io->msg.iov = NULL;
3649 } else {
3650 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3651 &io->msg.iov, &io->msg.msg.msg_iter);
3652 if (ret > 0)
3653 ret = 0;
3654 }
3655
3656 return ret;
3657}
3658
3659#ifdef CONFIG_COMPAT
3660static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
3661 struct io_async_ctx *io)
3662{
3663 struct compat_msghdr __user *msg_compat;
3664 struct io_sr_msg *sr = &req->sr_msg;
3665 struct compat_iovec __user *uiov;
3666 compat_uptr_t ptr;
3667 compat_size_t len;
3668 int ret;
3669
3670 msg_compat = (struct compat_msghdr __user *) sr->msg;
3671 ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
3672 &ptr, &len);
3673 if (ret)
3674 return ret;
3675
3676 uiov = compat_ptr(ptr);
3677 if (req->flags & REQ_F_BUFFER_SELECT) {
3678 compat_ssize_t clen;
3679
3680 if (len > 1)
3681 return -EINVAL;
3682 if (!access_ok(uiov, sizeof(*uiov)))
3683 return -EFAULT;
3684 if (__get_user(clen, &uiov->iov_len))
3685 return -EFAULT;
3686 if (clen < 0)
3687 return -EINVAL;
 3688		sr->len = clen; /* iov[0] is not copied in the compat path */
3689 io->msg.iov = NULL;
3690 } else {
3691 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
3692 &io->msg.iov,
3693 &io->msg.msg.msg_iter);
3694 if (ret < 0)
3695 return ret;
3696 }
3697
3698 return 0;
3699}
3700#endif
3701
3702static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3703{
3704 io->msg.iov = io->msg.fast_iov;
3705
3706#ifdef CONFIG_COMPAT
3707 if (req->ctx->compat)
3708 return __io_compat_recvmsg_copy_hdr(req, io);
3709#endif
3710
3711 return __io_recvmsg_copy_hdr(req, io);
3712}
3713
bcda7baa
JA
3714static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
3715 int *cflags, bool needs_lock)
3716{
3717 struct io_sr_msg *sr = &req->sr_msg;
3718 struct io_buffer *kbuf;
3719
3720 if (!(req->flags & REQ_F_BUFFER_SELECT))
3721 return NULL;
3722
3723 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
3724 if (IS_ERR(kbuf))
3725 return kbuf;
3726
3727 sr->kbuf = kbuf;
3728 req->flags |= REQ_F_BUFFER_SELECTED;
3729
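/*
 * Tell the application which provided buffer was consumed: the buffer ID is
 * stored in the upper CQE flag bits (the app can recover it as
 * cqe->flags >> IORING_CQE_BUFFER_SHIFT) and IORING_CQE_F_BUFFER marks that
 * a buffer was indeed selected.
 */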
3730 *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
3731 *cflags |= IORING_CQE_F_BUFFER;
3732 return kbuf;
3733}
3734
3529d8c2
JA
3735static int io_recvmsg_prep(struct io_kiocb *req,
3736 const struct io_uring_sqe *sqe)
aa1fa28f 3737{
e47293fd 3738 struct io_sr_msg *sr = &req->sr_msg;
3529d8c2 3739 struct io_async_ctx *io = req->io;
99bc4c38 3740 int ret;
3529d8c2
JA
3741
3742 sr->msg_flags = READ_ONCE(sqe->msg_flags);
3743 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 3744 sr->len = READ_ONCE(sqe->len);
bcda7baa 3745 sr->bgid = READ_ONCE(sqe->buf_group);
06b76d44 3746
d8768362
JA
3747#ifdef CONFIG_COMPAT
3748 if (req->ctx->compat)
3749 sr->msg_flags |= MSG_CMSG_COMPAT;
3750#endif
3751
fddaface 3752 if (!io || req->opcode == IORING_OP_RECV)
06b76d44 3753 return 0;
5f798bea
PB
3754 /* iovec is already imported */
3755 if (req->flags & REQ_F_NEED_CLEANUP)
3756 return 0;
03b1230c 3757
52de1fe1 3758 ret = io_recvmsg_copy_hdr(req, io);
99bc4c38
PB
3759 if (!ret)
3760 req->flags |= REQ_F_NEED_CLEANUP;
3761 return ret;
aa1fa28f
JA
3762}
3763
014db007 3764static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
aa1fa28f 3765{
0b416c3e 3766 struct io_async_msghdr *kmsg = NULL;
03b1230c 3767 struct socket *sock;
52de1fe1 3768 int ret, cflags = 0;
03b1230c
JA
3769
3770 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3771 return -EINVAL;
3772
3773 sock = sock_from_file(req->file, &ret);
3774 if (sock) {
52de1fe1 3775 struct io_buffer *kbuf;
b7bb4f7d 3776 struct io_async_ctx io;
03b1230c
JA
3777 unsigned flags;
3778
03b1230c 3779 if (req->io) {
0b416c3e 3780 kmsg = &req->io->msg;
b537916c 3781 kmsg->msg.msg_name = &req->io->msg.addr;
0b416c3e
JA
3782 /* if iov is set, it's allocated already */
3783 if (!kmsg->iov)
3784 kmsg->iov = kmsg->fast_iov;
3785 kmsg->msg.msg_iter.iov = kmsg->iov;
03b1230c 3786 } else {
0b416c3e 3787 kmsg = &io.msg;
b537916c 3788 kmsg->msg.msg_name = &io.msg.addr;
3529d8c2 3789
52de1fe1 3790 ret = io_recvmsg_copy_hdr(req, &io);
03b1230c 3791 if (ret)
3529d8c2 3792 return ret;
03b1230c
JA
3793 }
3794
52de1fe1
JA
3795 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3796 if (IS_ERR(kbuf)) {
3797 return PTR_ERR(kbuf);
3798 } else if (kbuf) {
3799 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3800 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
3801 1, req->sr_msg.len);
3802 }
3803
e47293fd
JA
3804 flags = req->sr_msg.msg_flags;
3805 if (flags & MSG_DONTWAIT)
3806 req->flags |= REQ_F_NOWAIT;
3807 else if (force_nonblock)
3808 flags |= MSG_DONTWAIT;
3809
3810 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3811 kmsg->uaddr, flags);
02d27d89
PB
3812 if (force_nonblock && ret == -EAGAIN)
3813 return io_setup_async_msg(req, kmsg);
03b1230c
JA
3814 if (ret == -ERESTARTSYS)
3815 ret = -EINTR;
3816 }
3817
1e95081c 3818 if (kmsg && kmsg->iov != kmsg->fast_iov)
0b416c3e 3819 kfree(kmsg->iov);
99bc4c38 3820 req->flags &= ~REQ_F_NEED_CLEANUP;
52de1fe1 3821 __io_cqring_add_event(req, ret, cflags);
4e88d6e7
JA
3822 if (ret < 0)
3823 req_set_fail_links(req);
014db007 3824 io_put_req(req);
03b1230c 3825 return 0;
0fa03c62 3826}
5d17b4a4 3827
014db007 3828static int io_recv(struct io_kiocb *req, bool force_nonblock)
fddaface 3829{
bcda7baa 3830 struct io_buffer *kbuf = NULL;
fddaface 3831 struct socket *sock;
bcda7baa 3832 int ret, cflags = 0;
fddaface
JA
3833
3834 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3835 return -EINVAL;
3836
3837 sock = sock_from_file(req->file, &ret);
3838 if (sock) {
3839 struct io_sr_msg *sr = &req->sr_msg;
bcda7baa 3840 void __user *buf = sr->buf;
fddaface
JA
3841 struct msghdr msg;
3842 struct iovec iov;
3843 unsigned flags;
3844
bcda7baa
JA
3845 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3846 if (IS_ERR(kbuf))
3847 return PTR_ERR(kbuf);
3848 else if (kbuf)
3849 buf = u64_to_user_ptr(kbuf->addr);
3850
3851 ret = import_single_range(READ, buf, sr->len, &iov,
fddaface 3852 &msg.msg_iter);
bcda7baa
JA
3853 if (ret) {
3854 kfree(kbuf);
fddaface 3855 return ret;
bcda7baa 3856 }
fddaface 3857
bcda7baa 3858 req->flags |= REQ_F_NEED_CLEANUP;
fddaface
JA
3859 msg.msg_name = NULL;
3860 msg.msg_control = NULL;
3861 msg.msg_controllen = 0;
3862 msg.msg_namelen = 0;
3863 msg.msg_iocb = NULL;
3864 msg.msg_flags = 0;
3865
3866 flags = req->sr_msg.msg_flags;
3867 if (flags & MSG_DONTWAIT)
3868 req->flags |= REQ_F_NOWAIT;
3869 else if (force_nonblock)
3870 flags |= MSG_DONTWAIT;
3871
0b7b21e4 3872 ret = sock_recvmsg(sock, &msg, flags);
fddaface
JA
3873 if (force_nonblock && ret == -EAGAIN)
3874 return -EAGAIN;
3875 if (ret == -ERESTARTSYS)
3876 ret = -EINTR;
3877 }
3878
bcda7baa
JA
3879 kfree(kbuf);
3880 req->flags &= ~REQ_F_NEED_CLEANUP;
3881 __io_cqring_add_event(req, ret, cflags);
fddaface
JA
3882 if (ret < 0)
3883 req_set_fail_links(req);
014db007 3884 io_put_req(req);
fddaface 3885 return 0;
fddaface
JA
3886}
3887
3529d8c2 3888static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 3889{
8ed8d3c3
JA
3890 struct io_accept *accept = &req->accept;
3891
17f2fe35
JA
3892 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3893 return -EINVAL;
8042d6ce 3894 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
3895 return -EINVAL;
3896
d55e5f5b
JA
3897 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3898 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 3899 accept->flags = READ_ONCE(sqe->accept_flags);
8ed8d3c3 3900 return 0;
8ed8d3c3 3901}
17f2fe35 3902
014db007 3903static int __io_accept(struct io_kiocb *req, bool force_nonblock)
8ed8d3c3
JA
3904{
3905 struct io_accept *accept = &req->accept;
3906 unsigned file_flags;
3907 int ret;
3908
3909 file_flags = force_nonblock ? O_NONBLOCK : 0;
3910 ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3911 accept->addr_len, accept->flags);
3912 if (ret == -EAGAIN && force_nonblock)
17f2fe35 3913 return -EAGAIN;
8e3cca12
JA
3914 if (ret == -ERESTARTSYS)
3915 ret = -EINTR;
4e88d6e7
JA
3916 if (ret < 0)
3917 req_set_fail_links(req);
78e19bbe 3918 io_cqring_add_event(req, ret);
014db007 3919 io_put_req(req);
17f2fe35 3920 return 0;
8ed8d3c3
JA
3921}
3922
3923static void io_accept_finish(struct io_wq_work **workptr)
3924{
3925 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
8ed8d3c3
JA
3926
3927 if (io_req_cancelled(req))
3928 return;
014db007 3929 __io_accept(req, false);
e9fd9396 3930 io_steal_work(req, workptr);
8ed8d3c3 3931}
8ed8d3c3 3932
014db007 3933static int io_accept(struct io_kiocb *req, bool force_nonblock)
8ed8d3c3 3934{
8ed8d3c3
JA
3935 int ret;
3936
014db007 3937 ret = __io_accept(req, force_nonblock);
8ed8d3c3
JA
3938 if (ret == -EAGAIN && force_nonblock) {
3939 req->work.func = io_accept_finish;
8ed8d3c3
JA
3940 return -EAGAIN;
3941 }
3942 return 0;
0fa03c62 3943}
5d17b4a4 3944
3529d8c2 3945static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 3946{
3529d8c2
JA
3947 struct io_connect *conn = &req->connect;
3948 struct io_async_ctx *io = req->io;
f499a021 3949
3fbb51c1
JA
3950 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3951 return -EINVAL;
3952 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3953 return -EINVAL;
3954
3529d8c2
JA
3955 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3956 conn->addr_len = READ_ONCE(sqe->addr2);
3957
3958 if (!io)
3959 return 0;
3960
3961 return move_addr_to_kernel(conn->addr, conn->addr_len,
3fbb51c1 3962 &io->connect.address);
f499a021
JA
3963}
3964
014db007 3965static int io_connect(struct io_kiocb *req, bool force_nonblock)
f8e85cf2 3966{
f499a021 3967 struct io_async_ctx __io, *io;
f8e85cf2 3968 unsigned file_flags;
3fbb51c1 3969 int ret;
f8e85cf2 3970
f499a021
JA
3971 if (req->io) {
3972 io = req->io;
3973 } else {
3529d8c2
JA
3974 ret = move_addr_to_kernel(req->connect.addr,
3975 req->connect.addr_len,
3976 &__io.connect.address);
f499a021
JA
3977 if (ret)
3978 goto out;
3979 io = &__io;
3980 }
3981
3fbb51c1
JA
3982 file_flags = force_nonblock ? O_NONBLOCK : 0;
3983
3984 ret = __sys_connect_file(req->file, &io->connect.address,
3985 req->connect.addr_len, file_flags);
87f80d62 3986 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
b7bb4f7d
JA
3987 if (req->io)
3988 return -EAGAIN;
3989 if (io_alloc_async_ctx(req)) {
f499a021
JA
3990 ret = -ENOMEM;
3991 goto out;
3992 }
b7bb4f7d 3993 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
f8e85cf2 3994 return -EAGAIN;
f499a021 3995 }
f8e85cf2
JA
3996 if (ret == -ERESTARTSYS)
3997 ret = -EINTR;
f499a021 3998out:
4e88d6e7
JA
3999 if (ret < 0)
4000 req_set_fail_links(req);
f8e85cf2 4001 io_cqring_add_event(req, ret);
014db007 4002 io_put_req(req);
f8e85cf2 4003 return 0;
469956e8
Y
4004}
4005#else /* !CONFIG_NET */
4006static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4007{
4008 return -EOPNOTSUPP;
4009}
4010
4011static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
4012{
4013 return -EOPNOTSUPP;
4014}
4015
4016static int io_send(struct io_kiocb *req, bool force_nonblock)
4017{
4018 return -EOPNOTSUPP;
4019}
4020
4021static int io_recvmsg_prep(struct io_kiocb *req,
4022 const struct io_uring_sqe *sqe)
4023{
4024 return -EOPNOTSUPP;
4025}
4026
4027static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
4028{
4029 return -EOPNOTSUPP;
4030}
4031
4032static int io_recv(struct io_kiocb *req, bool force_nonblock)
4033{
4034 return -EOPNOTSUPP;
4035}
4036
4037static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4038{
4039 return -EOPNOTSUPP;
4040}
4041
4042static int io_accept(struct io_kiocb *req, bool force_nonblock)
4043{
4044 return -EOPNOTSUPP;
4045}
4046
4047static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4048{
4049 return -EOPNOTSUPP;
4050}
4051
4052static int io_connect(struct io_kiocb *req, bool force_nonblock)
4053{
f8e85cf2 4054 return -EOPNOTSUPP;
f8e85cf2 4055}
469956e8 4056#endif /* CONFIG_NET */
f8e85cf2 4057
d7718a9d
JA
4058struct io_poll_table {
4059 struct poll_table_struct pt;
4060 struct io_kiocb *req;
4061 int error;
4062};
4063
4064static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4065 struct wait_queue_head *head)
4066{
4067 if (unlikely(poll->head)) {
4068 pt->error = -EINVAL;
4069 return;
4070 }
4071
4072 pt->error = 0;
4073 poll->head = head;
4074 add_wait_queue(head, &poll->wait);
4075}
4076
4077static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4078 struct poll_table_struct *p)
4079{
4080 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4081
4082 __io_queue_proc(&pt->req->apoll->poll, pt, head);
4083}
4084
4085static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4086 __poll_t mask, task_work_func_t func)
4087{
4088 struct task_struct *tsk;
4089
4090 /* for instances that support it, check for an event match first */
4091 if (mask && !(mask & poll->events))
4092 return 0;
4093
4094 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4095
4096 list_del_init(&poll->wait.entry);
4097
4098 tsk = req->task;
4099 req->result = mask;
4100 init_task_work(&req->task_work, func);
4101 /*
4102 * If this fails, then the task is exiting. If that is the case, then
4103 * the exit check will ultimately cancel these work items. Hence we
4104 * don't need to check here and handle it specifically.
4105 */
4106 task_work_add(tsk, &req->task_work, true);
4107 wake_up_process(tsk);
4108 return 1;
4109}
4110
4111static void io_async_task_func(struct callback_head *cb)
4112{
4113 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4114 struct async_poll *apoll = req->apoll;
4115 struct io_ring_ctx *ctx = req->ctx;
4116
4117 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4118
4119 WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
4120
4121 if (hash_hashed(&req->hash_node)) {
4122 spin_lock_irq(&ctx->completion_lock);
4123 hash_del(&req->hash_node);
4124 spin_unlock_irq(&ctx->completion_lock);
4125 }
4126
4127 /* restore ->work in case we need to retry again */
4128 memcpy(&req->work, &apoll->work, sizeof(req->work));
4129
4130 __set_current_state(TASK_RUNNING);
4131 mutex_lock(&ctx->uring_lock);
4132 __io_queue_sqe(req, NULL);
4133 mutex_unlock(&ctx->uring_lock);
4134
4135 kfree(apoll);
4136}
4137
4138static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4139 void *key)
4140{
4141 struct io_kiocb *req = wait->private;
4142 struct io_poll_iocb *poll = &req->apoll->poll;
4143
4144 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4145 key_to_poll(key));
4146
4147 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4148}
4149
4150static void io_poll_req_insert(struct io_kiocb *req)
4151{
4152 struct io_ring_ctx *ctx = req->ctx;
4153 struct hlist_head *list;
4154
4155 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4156 hlist_add_head(&req->hash_node, list);
4157}
4158
4159static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4160 struct io_poll_iocb *poll,
4161 struct io_poll_table *ipt, __poll_t mask,
4162 wait_queue_func_t wake_func)
4163 __acquires(&ctx->completion_lock)
4164{
4165 struct io_ring_ctx *ctx = req->ctx;
4166 bool cancel = false;
4167
4168 poll->file = req->file;
4169 poll->head = NULL;
4170 poll->done = poll->canceled = false;
4171 poll->events = mask;
4172
4173 ipt->pt._key = mask;
4174 ipt->req = req;
4175 ipt->error = -EINVAL;
4176
4177 INIT_LIST_HEAD(&poll->wait.entry);
4178 init_waitqueue_func_entry(&poll->wait, wake_func);
4179 poll->wait.private = req;
4180
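/*
 * vfs_poll() both checks current readiness and, via the queue proc installed
 * in ipt->pt, hooks poll->wait into the file's waitqueue so the wake function
 * can complete the request later.
 */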
4181 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4182
4183 spin_lock_irq(&ctx->completion_lock);
4184 if (likely(poll->head)) {
4185 spin_lock(&poll->head->lock);
4186 if (unlikely(list_empty(&poll->wait.entry))) {
4187 if (ipt->error)
4188 cancel = true;
4189 ipt->error = 0;
4190 mask = 0;
4191 }
4192 if (mask || ipt->error)
4193 list_del_init(&poll->wait.entry);
4194 else if (cancel)
4195 WRITE_ONCE(poll->canceled, true);
4196 else if (!poll->done) /* actually waiting for an event */
4197 io_poll_req_insert(req);
4198 spin_unlock(&poll->head->lock);
4199 }
4200
4201 return mask;
4202}
4203
4204static bool io_arm_poll_handler(struct io_kiocb *req)
4205{
4206 const struct io_op_def *def = &io_op_defs[req->opcode];
4207 struct io_ring_ctx *ctx = req->ctx;
4208 struct async_poll *apoll;
4209 struct io_poll_table ipt;
4210 __poll_t mask, ret;
4211
4212 if (!req->file || !file_can_poll(req->file))
4213 return false;
4214 if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
4215 return false;
4216 if (!def->pollin && !def->pollout)
4217 return false;
4218
4219 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4220 if (unlikely(!apoll))
4221 return false;
4222
4223 req->flags |= REQ_F_POLLED;
4224 memcpy(&apoll->work, &req->work, sizeof(req->work));
4225
4226 /*
4227 * Don't need a reference here, as we're adding it to the task's
4228 * task_works list. If the task exits, the list is pruned.
4229 */
4230 req->task = current;
4231 req->apoll = apoll;
4232 INIT_HLIST_NODE(&req->hash_node);
4233
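/*
 * Build the poll mask from the opcode definition: read-readiness for pollin
 * ops, write-readiness for pollout ops, and always include error/priority
 * events.
 */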
8755d97a 4234 mask = 0;
d7718a9d 4235 if (def->pollin)
8755d97a 4236 mask |= POLLIN | POLLRDNORM;
d7718a9d
JA
4237 if (def->pollout)
4238 mask |= POLLOUT | POLLWRNORM;
4239 mask |= POLLERR | POLLPRI;
4240
4241 ipt.pt._qproc = io_async_queue_proc;
4242
4243 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4244 io_async_wake);
4245 if (ret) {
4246 ipt.error = 0;
4247 apoll->poll.done = true;
4248 spin_unlock_irq(&ctx->completion_lock);
4249 memcpy(&req->work, &apoll->work, sizeof(req->work));
4250 kfree(apoll);
4251 return false;
4252 }
4253 spin_unlock_irq(&ctx->completion_lock);
4254 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4255 apoll->poll.events);
4256 return true;
4257}
4258
4259static bool __io_poll_remove_one(struct io_kiocb *req,
4260 struct io_poll_iocb *poll)
221c5eb2 4261{
b41e9852 4262 bool do_complete = false;
221c5eb2
JA
4263
4264 spin_lock(&poll->head->lock);
4265 WRITE_ONCE(poll->canceled, true);
392edb45
JA
4266 if (!list_empty(&poll->wait.entry)) {
4267 list_del_init(&poll->wait.entry);
b41e9852 4268 do_complete = true;
221c5eb2
JA
4269 }
4270 spin_unlock(&poll->head->lock);
d7718a9d
JA
4271 return do_complete;
4272}
4273
4274static bool io_poll_remove_one(struct io_kiocb *req)
4275{
4276 bool do_complete;
4277
4278 if (req->opcode == IORING_OP_POLL_ADD) {
4279 do_complete = __io_poll_remove_one(req, &req->poll);
4280 } else {
4281 /* non-poll requests have submit ref still */
4282 do_complete = __io_poll_remove_one(req, &req->apoll->poll);
4283 if (do_complete)
4284 io_put_req(req);
4285 }
4286
78076bb6 4287 hash_del(&req->hash_node);
d7718a9d 4288
b41e9852
JA
4289 if (do_complete) {
4290 io_cqring_fill_event(req, -ECANCELED);
4291 io_commit_cqring(req->ctx);
4292 req->flags |= REQ_F_COMP_LOCKED;
4293 io_put_req(req);
4294 }
4295
4296 return do_complete;
221c5eb2
JA
4297}
4298
4299static void io_poll_remove_all(struct io_ring_ctx *ctx)
4300{
78076bb6 4301 struct hlist_node *tmp;
221c5eb2 4302 struct io_kiocb *req;
78076bb6 4303 int i;
221c5eb2
JA
4304
4305 spin_lock_irq(&ctx->completion_lock);
78076bb6
JA
4306 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4307 struct hlist_head *list;
4308
4309 list = &ctx->cancel_hash[i];
4310 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4311 io_poll_remove_one(req);
221c5eb2
JA
4312 }
4313 spin_unlock_irq(&ctx->completion_lock);
b41e9852
JA
4314
4315 io_cqring_ev_posted(ctx);
221c5eb2
JA
4316}
4317
47f46768
JA
4318static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4319{
78076bb6 4320 struct hlist_head *list;
47f46768
JA
4321 struct io_kiocb *req;
4322
78076bb6
JA
4323 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4324 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
4325 if (sqe_addr != req->user_data)
4326 continue;
4327 if (io_poll_remove_one(req))
eac406c6 4328 return 0;
b41e9852 4329 return -EALREADY;
47f46768
JA
4330 }
4331
4332 return -ENOENT;
4333}
4334
3529d8c2
JA
4335static int io_poll_remove_prep(struct io_kiocb *req,
4336 const struct io_uring_sqe *sqe)
0969e783 4337{
0969e783
JA
4338 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4339 return -EINVAL;
4340 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4341 sqe->poll_events)
4342 return -EINVAL;
4343
4344 req->poll.addr = READ_ONCE(sqe->addr);
0969e783
JA
4345 return 0;
4346}
4347
221c5eb2
JA
4348/*
4349 * Find a running poll command that matches one specified in sqe->addr,
4350 * and remove it if found.
4351 */
fc4df999 4352static int io_poll_remove(struct io_kiocb *req)
221c5eb2
JA
4353{
4354 struct io_ring_ctx *ctx = req->ctx;
0969e783 4355 u64 addr;
47f46768 4356 int ret;
221c5eb2 4357
0969e783 4358 addr = req->poll.addr;
221c5eb2 4359 spin_lock_irq(&ctx->completion_lock);
0969e783 4360 ret = io_poll_cancel(ctx, addr);
221c5eb2
JA
4361 spin_unlock_irq(&ctx->completion_lock);
4362
78e19bbe 4363 io_cqring_add_event(req, ret);
4e88d6e7
JA
4364 if (ret < 0)
4365 req_set_fail_links(req);
e65ef56d 4366 io_put_req(req);
221c5eb2
JA
4367 return 0;
4368}
4369
b0dd8a41 4370static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
221c5eb2 4371{
a197f664
JL
4372 struct io_ring_ctx *ctx = req->ctx;
4373
8c838788 4374 req->poll.done = true;
b0a20349 4375 io_cqring_fill_event(req, error ? error : mangle_poll(mask));
8c838788 4376 io_commit_cqring(ctx);
221c5eb2
JA
4377}
4378
b41e9852 4379static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
221c5eb2 4380{
221c5eb2 4381 struct io_ring_ctx *ctx = req->ctx;
221c5eb2 4382
221c5eb2 4383 spin_lock_irq(&ctx->completion_lock);
78076bb6 4384 hash_del(&req->hash_node);
b41e9852
JA
4385 io_poll_complete(req, req->result, 0);
4386 req->flags |= REQ_F_COMP_LOCKED;
4387 io_put_req_find_next(req, nxt);
e94f141b
JA
4388 spin_unlock_irq(&ctx->completion_lock);
4389
4390 io_cqring_ev_posted(ctx);
e94f141b
JA
4391}
4392
b41e9852 4393static void io_poll_task_func(struct callback_head *cb)
f0b493e6 4394{
b41e9852
JA
4395 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4396 struct io_kiocb *nxt = NULL;
f0b493e6 4397
b41e9852 4398 io_poll_task_handler(req, &nxt);
d7718a9d
JA
4399 if (nxt) {
4400 struct io_ring_ctx *ctx = nxt->ctx;
4401
4402 mutex_lock(&ctx->uring_lock);
b41e9852 4403 __io_queue_sqe(nxt, NULL);
d7718a9d
JA
4404 mutex_unlock(&ctx->uring_lock);
4405 }
f0b493e6
JA
4406}
4407
221c5eb2
JA
4408static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4409 void *key)
4410{
c2f2eb7d
JA
4411 struct io_kiocb *req = wait->private;
4412 struct io_poll_iocb *poll = &req->poll;
221c5eb2 4413
d7718a9d 4414 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
4415}
4416
221c5eb2
JA
4417static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4418 struct poll_table_struct *p)
4419{
4420 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4421
d7718a9d 4422 __io_queue_proc(&pt->req->poll, pt, head);
eac406c6
JA
4423}
4424
3529d8c2 4425static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
4426{
4427 struct io_poll_iocb *poll = &req->poll;
221c5eb2 4428 u16 events;
221c5eb2
JA
4429
4430 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4431 return -EINVAL;
4432 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4433 return -EINVAL;
09bb8394
JA
4434 if (!poll->file)
4435 return -EBADF;
221c5eb2 4436
221c5eb2
JA
4437 events = READ_ONCE(sqe->poll_events);
4438 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
b41e9852 4439
d7718a9d
JA
4440 /*
4441 * Don't need a reference here, as we're adding it to the task's
4442 * task_works list. If the task exits, the list is pruned.
4443 */
b41e9852 4444 req->task = current;
0969e783
JA
4445 return 0;
4446}
4447
014db007 4448static int io_poll_add(struct io_kiocb *req)
0969e783
JA
4449{
4450 struct io_poll_iocb *poll = &req->poll;
4451 struct io_ring_ctx *ctx = req->ctx;
4452 struct io_poll_table ipt;
0969e783 4453 __poll_t mask;
0969e783 4454
78076bb6 4455 INIT_HLIST_NODE(&req->hash_node);
36703247 4456 INIT_LIST_HEAD(&req->list);
d7718a9d 4457 ipt.pt._qproc = io_poll_queue_proc;
36703247 4458
d7718a9d
JA
4459 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4460 io_poll_wake);
221c5eb2 4461
8c838788 4462 if (mask) { /* no async, we'd stolen it */
221c5eb2 4463 ipt.error = 0;
b0dd8a41 4464 io_poll_complete(req, mask, 0);
221c5eb2 4465 }
221c5eb2
JA
4466 spin_unlock_irq(&ctx->completion_lock);
4467
8c838788
JA
4468 if (mask) {
4469 io_cqring_ev_posted(ctx);
014db007 4470 io_put_req(req);
221c5eb2 4471 }
8c838788 4472 return ipt.error;
221c5eb2
JA
4473}
4474
5262f567
JA
4475static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4476{
ad8a48ac
JA
4477 struct io_timeout_data *data = container_of(timer,
4478 struct io_timeout_data, timer);
4479 struct io_kiocb *req = data->req;
4480 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
4481 unsigned long flags;
4482
5262f567
JA
4483 atomic_inc(&ctx->cq_timeouts);
4484
4485 spin_lock_irqsave(&ctx->completion_lock, flags);
ef03681a 4486 /*
11365043
JA
4487 * We could be racing with timeout deletion. If the list is empty,
4488 * then timeout lookup already found it and will be handling it.
ef03681a 4489 */
842f9612 4490 if (!list_empty(&req->list)) {
11365043 4491 struct io_kiocb *prev;
5262f567 4492
11365043
JA
4493 /*
4494 * Adjust the sequence of the reqs before the current one, because
d195a66e 4495 * this one will consume a slot in the cq_ring and the cq_tail
11365043
JA
4496 * pointer will be advanced; otherwise other timeout reqs may
4497 * complete early without waiting for enough wait_nr.
4498 */
4499 prev = req;
4500 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
4501 prev->sequence++;
11365043 4502 list_del_init(&req->list);
11365043 4503 }
5262f567 4504
78e19bbe 4505 io_cqring_fill_event(req, -ETIME);
5262f567
JA
4506 io_commit_cqring(ctx);
4507 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4508
4509 io_cqring_ev_posted(ctx);
4e88d6e7 4510 req_set_fail_links(req);
5262f567
JA
4511 io_put_req(req);
4512 return HRTIMER_NORESTART;
4513}
4514
47f46768
JA
4515static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4516{
4517 struct io_kiocb *req;
4518 int ret = -ENOENT;
4519
4520 list_for_each_entry(req, &ctx->timeout_list, list) {
4521 if (user_data == req->user_data) {
4522 list_del_init(&req->list);
4523 ret = 0;
4524 break;
4525 }
4526 }
4527
4528 if (ret == -ENOENT)
4529 return ret;
4530
2d28390a 4531 ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
47f46768
JA
4532 if (ret == -1)
4533 return -EALREADY;
4534
4e88d6e7 4535 req_set_fail_links(req);
47f46768
JA
4536 io_cqring_fill_event(req, -ECANCELED);
4537 io_put_req(req);
4538 return 0;
4539}
4540
3529d8c2
JA
4541static int io_timeout_remove_prep(struct io_kiocb *req,
4542 const struct io_uring_sqe *sqe)
b29472ee 4543{
b29472ee
JA
4544 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4545 return -EINVAL;
4546 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4547 return -EINVAL;
4548
4549 req->timeout.addr = READ_ONCE(sqe->addr);
4550 req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4551 if (req->timeout.flags)
4552 return -EINVAL;
4553
b29472ee
JA
4554 return 0;
4555}
4556
11365043
JA
4557/*
4558 * Remove or update an existing timeout command
4559 */
fc4df999 4560static int io_timeout_remove(struct io_kiocb *req)
11365043
JA
4561{
4562 struct io_ring_ctx *ctx = req->ctx;
47f46768 4563 int ret;
11365043 4564
11365043 4565 spin_lock_irq(&ctx->completion_lock);
b29472ee 4566 ret = io_timeout_cancel(ctx, req->timeout.addr);
11365043 4567
47f46768 4568 io_cqring_fill_event(req, ret);
11365043
JA
4569 io_commit_cqring(ctx);
4570 spin_unlock_irq(&ctx->completion_lock);
5262f567 4571 io_cqring_ev_posted(ctx);
4e88d6e7
JA
4572 if (ret < 0)
4573 req_set_fail_links(req);
ec9c02ad 4574 io_put_req(req);
11365043 4575 return 0;
5262f567
JA
4576}
4577
3529d8c2 4578static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 4579 bool is_timeout_link)
5262f567 4580{
ad8a48ac 4581 struct io_timeout_data *data;
a41525ab 4582 unsigned flags;
5262f567 4583
ad8a48ac 4584 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 4585 return -EINVAL;
ad8a48ac 4586 if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
a41525ab 4587 return -EINVAL;
2d28390a
JA
4588 if (sqe->off && is_timeout_link)
4589 return -EINVAL;
a41525ab
JA
4590 flags = READ_ONCE(sqe->timeout_flags);
4591 if (flags & ~IORING_TIMEOUT_ABS)
5262f567 4592 return -EINVAL;
bdf20073 4593
26a61679
JA
4594 req->timeout.count = READ_ONCE(sqe->off);
4595
3529d8c2 4596 if (!req->io && io_alloc_async_ctx(req))
26a61679
JA
4597 return -ENOMEM;
4598
4599 data = &req->io->timeout;
ad8a48ac 4600 data->req = req;
ad8a48ac
JA
4601 req->flags |= REQ_F_TIMEOUT;
4602
4603 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
4604 return -EFAULT;
4605
11365043 4606 if (flags & IORING_TIMEOUT_ABS)
ad8a48ac 4607 data->mode = HRTIMER_MODE_ABS;
11365043 4608 else
ad8a48ac 4609 data->mode = HRTIMER_MODE_REL;
11365043 4610
ad8a48ac
JA
4611 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4612 return 0;
4613}
4614
fc4df999 4615static int io_timeout(struct io_kiocb *req)
ad8a48ac
JA
4616{
4617 unsigned count;
4618 struct io_ring_ctx *ctx = req->ctx;
4619 struct io_timeout_data *data;
4620 struct list_head *entry;
4621 unsigned span = 0;
ad8a48ac 4622
2d28390a 4623 data = &req->io->timeout;
93bd25bb 4624
5262f567
JA
4625 /*
4626 * sqe->off holds how many events need to occur for this
93bd25bb
JA
4627 * timeout event to be satisfied. If it isn't set, then this is
4628 * a pure timeout request and the sequence isn't used.
5262f567 4629 */
26a61679 4630 count = req->timeout.count;
93bd25bb
JA
4631 if (!count) {
4632 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4633 spin_lock_irq(&ctx->completion_lock);
4634 entry = ctx->timeout_list.prev;
4635 goto add;
4636 }
5262f567
JA
4637
4638 req->sequence = ctx->cached_sq_head + count - 1;
2d28390a 4639 data->seq_offset = count;
5262f567
JA
4640
4641 /*
4642 * Insertion sort, ensuring the first entry in the list is always
4643 * the one we need first.
4644 */
5262f567
JA
4645 spin_lock_irq(&ctx->completion_lock);
4646 list_for_each_prev(entry, &ctx->timeout_list) {
4647 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5da0fb1a 4648 unsigned nxt_sq_head;
4649 long long tmp, tmp_nxt;
2d28390a 4650 u32 nxt_offset = nxt->io->timeout.seq_offset;
5262f567 4651
93bd25bb
JA
4652 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4653 continue;
4654
5da0fb1a 4655 /*
4656 * Since cached_sq_head + count - 1 can overflow, use type long
4657 * long to store it.
4658 */
4659 tmp = (long long)ctx->cached_sq_head + count - 1;
cc42e0ac
PB
4660 nxt_sq_head = nxt->sequence - nxt_offset + 1;
4661 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
5da0fb1a 4662
4663 /*
4664 * cached_sq_head may overflow, and it will never overflow twice
4665 * while there is still some valid timeout req pending.
4666 */
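/*
 * If the current head compares lower than nxt's head, the 32-bit sequence
 * wrapped after nxt was queued (e.g. nxt at UINT_MAX - 2, head now at 5);
 * add UINT_MAX so both values are compared in the same unwrapped range.
 */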
4667 if (ctx->cached_sq_head < nxt_sq_head)
8b07a65a 4668 tmp += UINT_MAX;
5da0fb1a 4669
a1f58ba4 4670 if (tmp > tmp_nxt)
5262f567 4671 break;
a1f58ba4 4672
4673 /*
4674 * The sequence of the reqs after the inserted one, and of itself, should
4675 * be adjusted because each timeout req consumes a slot.
4676 */
4677 span++;
4678 nxt->sequence++;
5262f567 4679 }
a1f58ba4 4680 req->sequence -= span;
93bd25bb 4681add:
5262f567 4682 list_add(&req->list, entry);
ad8a48ac
JA
4683 data->timer.function = io_timeout_fn;
4684 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5262f567 4685 spin_unlock_irq(&ctx->completion_lock);
5262f567
JA
4686 return 0;
4687}
5262f567 4688
62755e35
JA
4689static bool io_cancel_cb(struct io_wq_work *work, void *data)
4690{
4691 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4692
4693 return req->user_data == (unsigned long) data;
4694}
4695
e977d6d3 4696static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
62755e35 4697{
62755e35 4698 enum io_wq_cancel cancel_ret;
62755e35
JA
4699 int ret = 0;
4700
62755e35
JA
4701 cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4702 switch (cancel_ret) {
4703 case IO_WQ_CANCEL_OK:
4704 ret = 0;
4705 break;
4706 case IO_WQ_CANCEL_RUNNING:
4707 ret = -EALREADY;
4708 break;
4709 case IO_WQ_CANCEL_NOTFOUND:
4710 ret = -ENOENT;
4711 break;
4712 }
4713
e977d6d3
JA
4714 return ret;
4715}
4716
47f46768
JA
4717static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4718 struct io_kiocb *req, __u64 sqe_addr,
014db007 4719 int success_ret)
47f46768
JA
4720{
4721 unsigned long flags;
4722 int ret;
4723
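/*
 * Attempt the cancellation targets in order: work queued in io-wq first,
 * then armed timeouts, then poll requests.
 */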
4724 ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4725 if (ret != -ENOENT) {
4726 spin_lock_irqsave(&ctx->completion_lock, flags);
4727 goto done;
4728 }
4729
4730 spin_lock_irqsave(&ctx->completion_lock, flags);
4731 ret = io_timeout_cancel(ctx, sqe_addr);
4732 if (ret != -ENOENT)
4733 goto done;
4734 ret = io_poll_cancel(ctx, sqe_addr);
4735done:
b0dd8a41
JA
4736 if (!ret)
4737 ret = success_ret;
47f46768
JA
4738 io_cqring_fill_event(req, ret);
4739 io_commit_cqring(ctx);
4740 spin_unlock_irqrestore(&ctx->completion_lock, flags);
4741 io_cqring_ev_posted(ctx);
4742
4e88d6e7
JA
4743 if (ret < 0)
4744 req_set_fail_links(req);
014db007 4745 io_put_req(req);
47f46768
JA
4746}
4747
3529d8c2
JA
4748static int io_async_cancel_prep(struct io_kiocb *req,
4749 const struct io_uring_sqe *sqe)
e977d6d3 4750{
fbf23849 4751 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3
JA
4752 return -EINVAL;
4753 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4754 sqe->cancel_flags)
4755 return -EINVAL;
4756
fbf23849
JA
4757 req->cancel.addr = READ_ONCE(sqe->addr);
4758 return 0;
4759}
4760
014db007 4761static int io_async_cancel(struct io_kiocb *req)
fbf23849
JA
4762{
4763 struct io_ring_ctx *ctx = req->ctx;
fbf23849 4764
014db007 4765 io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5262f567
JA
4766 return 0;
4767}
4768
05f3fb3c
JA
4769static int io_files_update_prep(struct io_kiocb *req,
4770 const struct io_uring_sqe *sqe)
4771{
4772 if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4773 return -EINVAL;
4774
4775 req->files_update.offset = READ_ONCE(sqe->off);
4776 req->files_update.nr_args = READ_ONCE(sqe->len);
4777 if (!req->files_update.nr_args)
4778 return -EINVAL;
4779 req->files_update.arg = READ_ONCE(sqe->addr);
4780 return 0;
4781}
4782
4783static int io_files_update(struct io_kiocb *req, bool force_nonblock)
fbf23849
JA
4784{
4785 struct io_ring_ctx *ctx = req->ctx;
05f3fb3c
JA
4786 struct io_uring_files_update up;
4787 int ret;
fbf23849 4788
f86cd20c 4789 if (force_nonblock)
05f3fb3c 4790 return -EAGAIN;
05f3fb3c
JA
4791
4792 up.offset = req->files_update.offset;
4793 up.fds = req->files_update.arg;
4794
4795 mutex_lock(&ctx->uring_lock);
4796 ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4797 mutex_unlock(&ctx->uring_lock);
4798
4799 if (ret < 0)
4800 req_set_fail_links(req);
4801 io_cqring_add_event(req, ret);
4802 io_put_req(req);
5262f567
JA
4803 return 0;
4804}
4805
3529d8c2
JA
4806static int io_req_defer_prep(struct io_kiocb *req,
4807 const struct io_uring_sqe *sqe)
f67676d1 4808{
e781573e 4809 ssize_t ret = 0;
f67676d1 4810
f86cd20c
JA
4811 if (io_op_defs[req->opcode].file_table) {
4812 ret = io_grab_files(req);
4813 if (unlikely(ret))
4814 return ret;
4815 }
4816
cccf0ee8
JA
4817 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4818
d625c6ee 4819 switch (req->opcode) {
e781573e
JA
4820 case IORING_OP_NOP:
4821 break;
f67676d1
JA
4822 case IORING_OP_READV:
4823 case IORING_OP_READ_FIXED:
3a6820f2 4824 case IORING_OP_READ:
3529d8c2 4825 ret = io_read_prep(req, sqe, true);
f67676d1
JA
4826 break;
4827 case IORING_OP_WRITEV:
4828 case IORING_OP_WRITE_FIXED:
3a6820f2 4829 case IORING_OP_WRITE:
3529d8c2 4830 ret = io_write_prep(req, sqe, true);
f67676d1 4831 break;
0969e783 4832 case IORING_OP_POLL_ADD:
3529d8c2 4833 ret = io_poll_add_prep(req, sqe);
0969e783
JA
4834 break;
4835 case IORING_OP_POLL_REMOVE:
3529d8c2 4836 ret = io_poll_remove_prep(req, sqe);
0969e783 4837 break;
8ed8d3c3 4838 case IORING_OP_FSYNC:
3529d8c2 4839 ret = io_prep_fsync(req, sqe);
8ed8d3c3
JA
4840 break;
4841 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2 4842 ret = io_prep_sfr(req, sqe);
8ed8d3c3 4843 break;
03b1230c 4844 case IORING_OP_SENDMSG:
fddaface 4845 case IORING_OP_SEND:
3529d8c2 4846 ret = io_sendmsg_prep(req, sqe);
03b1230c
JA
4847 break;
4848 case IORING_OP_RECVMSG:
fddaface 4849 case IORING_OP_RECV:
3529d8c2 4850 ret = io_recvmsg_prep(req, sqe);
03b1230c 4851 break;
f499a021 4852 case IORING_OP_CONNECT:
3529d8c2 4853 ret = io_connect_prep(req, sqe);
f499a021 4854 break;
2d28390a 4855 case IORING_OP_TIMEOUT:
3529d8c2 4856 ret = io_timeout_prep(req, sqe, false);
b7bb4f7d 4857 break;
b29472ee 4858 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2 4859 ret = io_timeout_remove_prep(req, sqe);
b29472ee 4860 break;
fbf23849 4861 case IORING_OP_ASYNC_CANCEL:
3529d8c2 4862 ret = io_async_cancel_prep(req, sqe);
fbf23849 4863 break;
2d28390a 4864 case IORING_OP_LINK_TIMEOUT:
3529d8c2 4865 ret = io_timeout_prep(req, sqe, true);
b7bb4f7d 4866 break;
8ed8d3c3 4867 case IORING_OP_ACCEPT:
3529d8c2 4868 ret = io_accept_prep(req, sqe);
8ed8d3c3 4869 break;
d63d1b5e
JA
4870 case IORING_OP_FALLOCATE:
4871 ret = io_fallocate_prep(req, sqe);
4872 break;
15b71abe
JA
4873 case IORING_OP_OPENAT:
4874 ret = io_openat_prep(req, sqe);
4875 break;
b5dba59e
JA
4876 case IORING_OP_CLOSE:
4877 ret = io_close_prep(req, sqe);
4878 break;
05f3fb3c
JA
4879 case IORING_OP_FILES_UPDATE:
4880 ret = io_files_update_prep(req, sqe);
4881 break;
eddc7ef5
JA
4882 case IORING_OP_STATX:
4883 ret = io_statx_prep(req, sqe);
4884 break;
4840e418
JA
4885 case IORING_OP_FADVISE:
4886 ret = io_fadvise_prep(req, sqe);
4887 break;
c1ca757b
JA
4888 case IORING_OP_MADVISE:
4889 ret = io_madvise_prep(req, sqe);
4890 break;
cebdb986
JA
4891 case IORING_OP_OPENAT2:
4892 ret = io_openat2_prep(req, sqe);
4893 break;
3e4827b0
JA
4894 case IORING_OP_EPOLL_CTL:
4895 ret = io_epoll_ctl_prep(req, sqe);
4896 break;
7d67af2c
PB
4897 case IORING_OP_SPLICE:
4898 ret = io_splice_prep(req, sqe);
4899 break;
ddf0322d
JA
4900 case IORING_OP_PROVIDE_BUFFERS:
4901 ret = io_provide_buffers_prep(req, sqe);
4902 break;
067524e9
JA
4903 case IORING_OP_REMOVE_BUFFERS:
4904 ret = io_remove_buffers_prep(req, sqe);
4905 break;
f67676d1 4906 default:
e781573e
JA
4907 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4908 req->opcode);
4909 ret = -EINVAL;
b7bb4f7d 4910 break;
f67676d1
JA
4911 }
4912
b7bb4f7d 4913 return ret;
f67676d1
JA
4914}
4915
3529d8c2 4916static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
de0617e4 4917{
a197f664 4918 struct io_ring_ctx *ctx = req->ctx;
f67676d1 4919 int ret;
de0617e4 4920
9d858b21
BL
4921 /* Still need defer if there is pending req in defer list. */
4922 if (!req_need_defer(req) && list_empty(&ctx->defer_list))
de0617e4
JA
4923 return 0;
4924
3529d8c2 4925 if (!req->io && io_alloc_async_ctx(req))
de0617e4
JA
4926 return -EAGAIN;
4927
3529d8c2 4928 ret = io_req_defer_prep(req, sqe);
b7bb4f7d 4929 if (ret < 0)
2d28390a 4930 return ret;
2d28390a 4931
de0617e4 4932 spin_lock_irq(&ctx->completion_lock);
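/* recheck under the lock in case the defer condition cleared while preparing */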
9d858b21 4933 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
de0617e4 4934 spin_unlock_irq(&ctx->completion_lock);
de0617e4
JA
4935 return 0;
4936 }
4937
915967f6 4938 trace_io_uring_defer(ctx, req, req->user_data);
de0617e4
JA
4939 list_add_tail(&req->list, &ctx->defer_list);
4940 spin_unlock_irq(&ctx->completion_lock);
4941 return -EIOCBQUEUED;
4942}
4943
99bc4c38
PB
4944static void io_cleanup_req(struct io_kiocb *req)
4945{
4946 struct io_async_ctx *io = req->io;
4947
4948 switch (req->opcode) {
4949 case IORING_OP_READV:
4950 case IORING_OP_READ_FIXED:
4951 case IORING_OP_READ:
bcda7baa
JA
4952 if (req->flags & REQ_F_BUFFER_SELECTED)
4953 kfree((void *)(unsigned long)req->rw.addr);
4954 /* fallthrough */
99bc4c38
PB
4955 case IORING_OP_WRITEV:
4956 case IORING_OP_WRITE_FIXED:
4957 case IORING_OP_WRITE:
4958 if (io->rw.iov != io->rw.fast_iov)
4959 kfree(io->rw.iov);
4960 break;
99bc4c38 4961 case IORING_OP_RECVMSG:
52de1fe1
JA
4962 if (req->flags & REQ_F_BUFFER_SELECTED)
4963 kfree(req->sr_msg.kbuf);
4964 /* fallthrough */
4965 case IORING_OP_SENDMSG:
99bc4c38
PB
4966 if (io->msg.iov != io->msg.fast_iov)
4967 kfree(io->msg.iov);
4968 break;
bcda7baa
JA
4969 case IORING_OP_RECV:
4970 if (req->flags & REQ_F_BUFFER_SELECTED)
4971 kfree(req->sr_msg.kbuf);
4972 break;
8fef80bf
PB
4973 case IORING_OP_OPENAT:
4974 case IORING_OP_OPENAT2:
4975 case IORING_OP_STATX:
4976 putname(req->open.filename);
4977 break;
7d67af2c
PB
4978 case IORING_OP_SPLICE:
4979 io_put_file(req, req->splice.file_in,
4980 (req->splice.flags & SPLICE_F_FD_IN_FIXED));
4981 break;
99bc4c38
PB
4982 }
4983
4984 req->flags &= ~REQ_F_NEED_CLEANUP;
4985}
4986
3529d8c2 4987static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
014db007 4988 bool force_nonblock)
2b188cc1 4989{
a197f664 4990 struct io_ring_ctx *ctx = req->ctx;
d625c6ee 4991 int ret;
2b188cc1 4992
d625c6ee 4993 switch (req->opcode) {
2b188cc1 4994 case IORING_OP_NOP:
78e19bbe 4995 ret = io_nop(req);
2b188cc1
JA
4996 break;
4997 case IORING_OP_READV:
edafccee 4998 case IORING_OP_READ_FIXED:
3a6820f2 4999 case IORING_OP_READ:
3529d8c2
JA
5000 if (sqe) {
5001 ret = io_read_prep(req, sqe, force_nonblock);
5002 if (ret < 0)
5003 break;
5004 }
014db007 5005 ret = io_read(req, force_nonblock);
edafccee 5006 break;
3529d8c2 5007 case IORING_OP_WRITEV:
edafccee 5008 case IORING_OP_WRITE_FIXED:
3a6820f2 5009 case IORING_OP_WRITE:
3529d8c2
JA
5010 if (sqe) {
5011 ret = io_write_prep(req, sqe, force_nonblock);
5012 if (ret < 0)
5013 break;
5014 }
014db007 5015 ret = io_write(req, force_nonblock);
2b188cc1 5016 break;
c992fe29 5017 case IORING_OP_FSYNC:
3529d8c2
JA
5018 if (sqe) {
5019 ret = io_prep_fsync(req, sqe);
5020 if (ret < 0)
5021 break;
5022 }
014db007 5023 ret = io_fsync(req, force_nonblock);
c992fe29 5024 break;
221c5eb2 5025 case IORING_OP_POLL_ADD:
3529d8c2
JA
5026 if (sqe) {
5027 ret = io_poll_add_prep(req, sqe);
5028 if (ret)
5029 break;
5030 }
014db007 5031 ret = io_poll_add(req);
221c5eb2
JA
5032 break;
5033 case IORING_OP_POLL_REMOVE:
3529d8c2
JA
5034 if (sqe) {
5035 ret = io_poll_remove_prep(req, sqe);
5036 if (ret < 0)
5037 break;
5038 }
fc4df999 5039 ret = io_poll_remove(req);
221c5eb2 5040 break;
5d17b4a4 5041 case IORING_OP_SYNC_FILE_RANGE:
3529d8c2
JA
5042 if (sqe) {
5043 ret = io_prep_sfr(req, sqe);
5044 if (ret < 0)
5045 break;
5046 }
014db007 5047 ret = io_sync_file_range(req, force_nonblock);
5d17b4a4 5048 break;
0fa03c62 5049 case IORING_OP_SENDMSG:
fddaface 5050 case IORING_OP_SEND:
3529d8c2
JA
5051 if (sqe) {
5052 ret = io_sendmsg_prep(req, sqe);
5053 if (ret < 0)
5054 break;
5055 }
fddaface 5056 if (req->opcode == IORING_OP_SENDMSG)
014db007 5057 ret = io_sendmsg(req, force_nonblock);
fddaface 5058 else
014db007 5059 ret = io_send(req, force_nonblock);
0fa03c62 5060 break;
aa1fa28f 5061 case IORING_OP_RECVMSG:
fddaface 5062 case IORING_OP_RECV:
3529d8c2
JA
5063 if (sqe) {
5064 ret = io_recvmsg_prep(req, sqe);
5065 if (ret)
5066 break;
5067 }
fddaface 5068 if (req->opcode == IORING_OP_RECVMSG)
014db007 5069 ret = io_recvmsg(req, force_nonblock);
fddaface 5070 else
014db007 5071 ret = io_recv(req, force_nonblock);
aa1fa28f 5072 break;
5262f567 5073 case IORING_OP_TIMEOUT:
3529d8c2
JA
5074 if (sqe) {
5075 ret = io_timeout_prep(req, sqe, false);
5076 if (ret)
5077 break;
5078 }
fc4df999 5079 ret = io_timeout(req);
5262f567 5080 break;
11365043 5081 case IORING_OP_TIMEOUT_REMOVE:
3529d8c2
JA
5082 if (sqe) {
5083 ret = io_timeout_remove_prep(req, sqe);
5084 if (ret)
5085 break;
5086 }
fc4df999 5087 ret = io_timeout_remove(req);
11365043 5088 break;
17f2fe35 5089 case IORING_OP_ACCEPT:
3529d8c2
JA
5090 if (sqe) {
5091 ret = io_accept_prep(req, sqe);
5092 if (ret)
5093 break;
5094 }
014db007 5095 ret = io_accept(req, force_nonblock);
17f2fe35 5096 break;
f8e85cf2 5097 case IORING_OP_CONNECT:
3529d8c2
JA
5098 if (sqe) {
5099 ret = io_connect_prep(req, sqe);
5100 if (ret)
5101 break;
5102 }
014db007 5103 ret = io_connect(req, force_nonblock);
f8e85cf2 5104 break;
62755e35 5105 case IORING_OP_ASYNC_CANCEL:
3529d8c2
JA
5106 if (sqe) {
5107 ret = io_async_cancel_prep(req, sqe);
5108 if (ret)
5109 break;
5110 }
014db007 5111 ret = io_async_cancel(req);
62755e35 5112 break;
d63d1b5e
JA
5113 case IORING_OP_FALLOCATE:
5114 if (sqe) {
5115 ret = io_fallocate_prep(req, sqe);
5116 if (ret)
5117 break;
5118 }
014db007 5119 ret = io_fallocate(req, force_nonblock);
d63d1b5e 5120 break;
15b71abe
JA
5121 case IORING_OP_OPENAT:
5122 if (sqe) {
5123 ret = io_openat_prep(req, sqe);
5124 if (ret)
5125 break;
5126 }
014db007 5127 ret = io_openat(req, force_nonblock);
15b71abe 5128 break;
b5dba59e
JA
5129 case IORING_OP_CLOSE:
5130 if (sqe) {
5131 ret = io_close_prep(req, sqe);
5132 if (ret)
5133 break;
5134 }
014db007 5135 ret = io_close(req, force_nonblock);
b5dba59e 5136 break;
05f3fb3c
JA
5137 case IORING_OP_FILES_UPDATE:
5138 if (sqe) {
5139 ret = io_files_update_prep(req, sqe);
5140 if (ret)
5141 break;
5142 }
5143 ret = io_files_update(req, force_nonblock);
5144 break;
eddc7ef5
JA
5145 case IORING_OP_STATX:
5146 if (sqe) {
5147 ret = io_statx_prep(req, sqe);
5148 if (ret)
5149 break;
5150 }
014db007 5151 ret = io_statx(req, force_nonblock);
eddc7ef5 5152 break;
4840e418
JA
5153 case IORING_OP_FADVISE:
5154 if (sqe) {
5155 ret = io_fadvise_prep(req, sqe);
5156 if (ret)
5157 break;
5158 }
014db007 5159 ret = io_fadvise(req, force_nonblock);
4840e418 5160 break;
c1ca757b
JA
5161 case IORING_OP_MADVISE:
5162 if (sqe) {
5163 ret = io_madvise_prep(req, sqe);
5164 if (ret)
5165 break;
5166 }
014db007 5167 ret = io_madvise(req, force_nonblock);
c1ca757b 5168 break;
cebdb986
JA
5169 case IORING_OP_OPENAT2:
5170 if (sqe) {
5171 ret = io_openat2_prep(req, sqe);
5172 if (ret)
5173 break;
5174 }
014db007 5175 ret = io_openat2(req, force_nonblock);
cebdb986 5176 break;
3e4827b0
JA
5177 case IORING_OP_EPOLL_CTL:
5178 if (sqe) {
5179 ret = io_epoll_ctl_prep(req, sqe);
5180 if (ret)
5181 break;
5182 }
014db007 5183 ret = io_epoll_ctl(req, force_nonblock);
3e4827b0 5184 break;
7d67af2c
PB
5185 case IORING_OP_SPLICE:
5186 if (sqe) {
5187 ret = io_splice_prep(req, sqe);
5188 if (ret < 0)
5189 break;
5190 }
014db007 5191 ret = io_splice(req, force_nonblock);
7d67af2c 5192 break;
ddf0322d
JA
5193 case IORING_OP_PROVIDE_BUFFERS:
5194 if (sqe) {
5195 ret = io_provide_buffers_prep(req, sqe);
5196 if (ret)
5197 break;
5198 }
5199 ret = io_provide_buffers(req, force_nonblock);
5200 break;
067524e9
JA
5201 case IORING_OP_REMOVE_BUFFERS:
5202 if (sqe) {
5203 ret = io_remove_buffers_prep(req, sqe);
5204 if (ret)
5205 break;
5206 }
5207 ret = io_remove_buffers(req, force_nonblock);
5208 break;
2b188cc1
JA
5209 default:
5210 ret = -EINVAL;
5211 break;
5212 }
5213
def596e9
JA
5214 if (ret)
5215 return ret;
5216
5217 if (ctx->flags & IORING_SETUP_IOPOLL) {
11ba820b
JA
5218 const bool in_async = io_wq_current_is_worker();
5219
9e645e11 5220 if (req->result == -EAGAIN)
def596e9
JA
5221 return -EAGAIN;
5222
11ba820b
JA
5223 /* workqueue context doesn't hold uring_lock, grab it now */
5224 if (in_async)
5225 mutex_lock(&ctx->uring_lock);
5226
def596e9 5227 io_iopoll_req_issued(req);
11ba820b
JA
5228
5229 if (in_async)
5230 mutex_unlock(&ctx->uring_lock);
def596e9
JA
5231 }
5232
5233 return 0;
2b188cc1
JA
5234}
5235
561fb04a 5236static void io_wq_submit_work(struct io_wq_work **workptr)
2b188cc1 5237{
561fb04a 5238 struct io_wq_work *work = *workptr;
2b188cc1 5239 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
561fb04a 5240 int ret = 0;
2b188cc1 5241
0c9d5ccd
JA
5242 /* if NO_CANCEL is set, we must still run the work */
5243 if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5244 IO_WQ_WORK_CANCEL) {
561fb04a 5245 ret = -ECANCELED;
0c9d5ccd 5246 }
31b51510 5247
561fb04a 5248 if (!ret) {
561fb04a 5249 do {
014db007 5250 ret = io_issue_sqe(req, NULL, false);
561fb04a
JA
5251 /*
5252 * We can get EAGAIN for polled IO even though we're
5253 * forcing a sync submission from here, since we can't
5254 * wait for request slots on the block side.
5255 */
5256 if (ret != -EAGAIN)
5257 break;
5258 cond_resched();
5259 } while (1);
5260 }
31b51510 5261
561fb04a 5262 if (ret) {
4e88d6e7 5263 req_set_fail_links(req);
78e19bbe 5264 io_cqring_add_event(req, ret);
817869d2 5265 io_put_req(req);
edafccee 5266 }
2b188cc1 5267
e9fd9396 5268 io_steal_work(req, workptr);
2b188cc1
JA
5269}
5270
15b71abe 5271static int io_req_needs_file(struct io_kiocb *req, int fd)
9e3aa61a 5272{
d3656344 5273 if (!io_op_defs[req->opcode].needs_file)
9e3aa61a 5274 return 0;
0b5faf6b 5275 if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
d3656344
JA
5276 return 0;
5277 return 1;
09bb8394
JA
5278}
5279
65e19f54
JA
5280static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5281 int index)
5282{
5283 struct fixed_file_table *table;
5284
05f3fb3c
JA
5285 table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5286 return table->files[index & IORING_FILE_TABLE_MASK];
65e19f54
JA
5287}
5288
8da11c19
PB
5289static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5290 int fd, struct file **out_file, bool fixed)
09bb8394 5291{
a197f664 5292 struct io_ring_ctx *ctx = req->ctx;
8da11c19 5293 struct file *file;
09bb8394 5294
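/*
 * Fixed files come from the registered file table (taking a reference on its
 * percpu ref); otherwise fall back to a normal fd lookup via the submit state.
 */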
8da11c19 5295 if (fixed) {
05f3fb3c 5296 if (unlikely(!ctx->file_data ||
09bb8394
JA
5297 (unsigned) fd >= ctx->nr_user_files))
5298 return -EBADF;
b7620121 5299 fd = array_index_nospec(fd, ctx->nr_user_files);
8da11c19
PB
5300 file = io_file_from_index(ctx, fd);
5301 if (!file)
08a45173 5302 return -EBADF;
05f3fb3c 5303 percpu_ref_get(&ctx->file_data->refs);
09bb8394 5304 } else {
c826bd7a 5305 trace_io_uring_file_get(ctx, fd);
8da11c19
PB
5306 file = __io_file_get(state, fd);
5307 if (unlikely(!file))
09bb8394
JA
5308 return -EBADF;
5309 }
5310
8da11c19 5311 *out_file = file;
09bb8394
JA
5312 return 0;
5313}
5314
8da11c19
PB
5315static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5316 const struct io_uring_sqe *sqe)
5317{
5318 unsigned flags;
5319 int fd;
5320 bool fixed;
5321
5322 flags = READ_ONCE(sqe->flags);
5323 fd = READ_ONCE(sqe->fd);
5324
5325 if (!io_req_needs_file(req, fd))
5326 return 0;
5327
5328 fixed = (flags & IOSQE_FIXED_FILE);
5329 if (unlikely(!fixed && req->needs_fixed_file))
5330 return -EBADF;
5331
5332 return io_file_get(state, req, fd, &req->file, fixed);
5333}
5334
a197f664 5335static int io_grab_files(struct io_kiocb *req)
fcb323cc
JA
5336{
5337 int ret = -EBADF;
a197f664 5338 struct io_ring_ctx *ctx = req->ctx;
fcb323cc 5339
f86cd20c
JA
5340 if (req->work.files)
5341 return 0;
b14cca0c 5342 if (!ctx->ring_file)
b5dba59e
JA
5343 return -EBADF;
5344
fcb323cc
JA
5345 rcu_read_lock();
5346 spin_lock_irq(&ctx->inflight_lock);
5347 /*
5348 * We use the f_ops->flush() handler to ensure that we can flush
5349 * out work accessing these files if the fd is closed. Check if
5350 * the fd has changed since we started down this path, and disallow
5351 * this operation if it has.
5352 */
b14cca0c 5353 if (fcheck(ctx->ring_fd) == ctx->ring_file) {
fcb323cc
JA
5354 list_add(&req->inflight_entry, &ctx->inflight_list);
5355 req->flags |= REQ_F_INFLIGHT;
5356 req->work.files = current->files;
5357 ret = 0;
5358 }
5359 spin_unlock_irq(&ctx->inflight_lock);
5360 rcu_read_unlock();
5361
5362 return ret;
5363}
5364
2665abfd 5365static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 5366{
ad8a48ac
JA
5367 struct io_timeout_data *data = container_of(timer,
5368 struct io_timeout_data, timer);
5369 struct io_kiocb *req = data->req;
2665abfd
JA
5370 struct io_ring_ctx *ctx = req->ctx;
5371 struct io_kiocb *prev = NULL;
5372 unsigned long flags;
2665abfd
JA
5373
5374 spin_lock_irqsave(&ctx->completion_lock, flags);
5375
5376 /*
5377 * We don't expect the list to be empty; that will only happen if we
5378 * race with the completion of the linked work.
5379 */
4493233e
PB
5380 if (!list_empty(&req->link_list)) {
5381 prev = list_entry(req->link_list.prev, struct io_kiocb,
5382 link_list);
5d960724 5383 if (refcount_inc_not_zero(&prev->refs)) {
4493233e 5384 list_del_init(&req->link_list);
5d960724
JA
5385 prev->flags &= ~REQ_F_LINK_TIMEOUT;
5386 } else
76a46e06 5387 prev = NULL;
2665abfd
JA
5388 }
5389
5390 spin_unlock_irqrestore(&ctx->completion_lock, flags);
5391
5392 if (prev) {
4e88d6e7 5393 req_set_fail_links(prev);
014db007 5394 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
76a46e06 5395 io_put_req(prev);
47f46768
JA
5396 } else {
5397 io_cqring_add_event(req, -ETIME);
5398 io_put_req(req);
2665abfd 5399 }
2665abfd
JA
5400 return HRTIMER_NORESTART;
5401}
5402
ad8a48ac 5403static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 5404{
76a46e06 5405 struct io_ring_ctx *ctx = req->ctx;
2665abfd 5406
76a46e06
JA
5407 /*
5408 * If the list is now empty, then our linked request finished before
5410 * we got a chance to set up the timer
5410 */
5411 spin_lock_irq(&ctx->completion_lock);
4493233e 5412 if (!list_empty(&req->link_list)) {
2d28390a 5413 struct io_timeout_data *data = &req->io->timeout;
94ae5e77 5414
ad8a48ac
JA
5415 data->timer.function = io_link_timeout_fn;
5416 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5417 data->mode);
2665abfd 5418 }
76a46e06 5419 spin_unlock_irq(&ctx->completion_lock);
2665abfd 5420
2665abfd 5421 /* drop submission reference */
76a46e06
JA
5422 io_put_req(req);
5423}
2665abfd 5424
ad8a48ac 5425static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2665abfd
JA
5426{
5427 struct io_kiocb *nxt;
5428
5429 if (!(req->flags & REQ_F_LINK))
5430 return NULL;
d7718a9d
JA
5431 /* for polled retry, if flag is set, we already went through here */
5432 if (req->flags & REQ_F_POLLED)
5433 return NULL;
2665abfd 5434
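/*
 * Only the first entry on this request's link list can be a linked timeout;
 * anything else means there is no timeout to arm here.
 */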
4493233e
PB
5435 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5436 link_list);
d625c6ee 5437 if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
76a46e06 5438 return NULL;
2665abfd 5439
76a46e06 5440 req->flags |= REQ_F_LINK_TIMEOUT;
76a46e06 5441 return nxt;
2665abfd
JA
5442}
5443
3529d8c2 5444static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 5445{
4a0a7a18 5446 struct io_kiocb *linked_timeout;
4bc4494e 5447 struct io_kiocb *nxt;
193155c8 5448 const struct cred *old_creds = NULL;
e0c5c576 5449 int ret;
2b188cc1 5450
4a0a7a18
JA
5451again:
5452 linked_timeout = io_prep_linked_timeout(req);
5453
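/*
 * If the request carries its own credentials (a registered personality),
 * issue it under those credentials; the originals are restored before
 * returning.
 */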
193155c8
JA
5454 if (req->work.creds && req->work.creds != current_cred()) {
5455 if (old_creds)
5456 revert_creds(old_creds);
5457 if (old_creds == req->work.creds)
5458 old_creds = NULL; /* restored original creds */
5459 else
5460 old_creds = override_creds(req->work.creds);
5461 }
5462
014db007 5463 ret = io_issue_sqe(req, sqe, true);
491381ce
JA
5464
5465 /*
5466 * We async punt it if the file wasn't marked NOWAIT, or if the file
5467 * doesn't support non-blocking read/write attempts
5468 */
5469 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5470 (req->flags & REQ_F_MUST_PUNT))) {
d7718a9d
JA
5471 if (io_arm_poll_handler(req)) {
5472 if (linked_timeout)
5473 io_queue_linked_timeout(linked_timeout);
4bc4494e 5474 goto exit;
d7718a9d 5475 }
86a761f8 5476punt:
f86cd20c 5477 if (io_op_defs[req->opcode].file_table) {
bbad27b2
PB
5478 ret = io_grab_files(req);
5479 if (ret)
5480 goto err;
2b188cc1 5481 }
bbad27b2
PB
5482
5483 /*
5484 * Queued up for async execution, worker will release
5485 * submit reference when the iocb is actually submitted.
5486 */
5487 io_queue_async_work(req);
4bc4494e 5488 goto exit;
2b188cc1 5489 }
e65ef56d 5490
fcb323cc 5491err:
4bc4494e 5492 nxt = NULL;
76a46e06 5493 /* drop submission reference */
2a44f467 5494 io_put_req_find_next(req, &nxt);
e65ef56d 5495
f9bd67f6 5496 if (linked_timeout) {
76a46e06 5497 if (!ret)
f9bd67f6 5498 io_queue_linked_timeout(linked_timeout);
76a46e06 5499 else
f9bd67f6 5500 io_put_req(linked_timeout);
76a46e06
JA
5501 }
5502
e65ef56d 5503 /* and drop final reference, if we failed */
9e645e11 5504 if (ret) {
78e19bbe 5505 io_cqring_add_event(req, ret);
4e88d6e7 5506 req_set_fail_links(req);
e65ef56d 5507 io_put_req(req);
9e645e11 5508 }
4a0a7a18
JA
5509 if (nxt) {
5510 req = nxt;
86a761f8
PB
5511
5512 if (req->flags & REQ_F_FORCE_ASYNC)
5513 goto punt;
4a0a7a18
JA
5514 goto again;
5515 }
4bc4494e 5516exit:
193155c8
JA
5517 if (old_creds)
5518 revert_creds(old_creds);
2b188cc1
JA
5519}
5520
3529d8c2 5521static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4fe2c963
JL
5522{
5523 int ret;
5524
3529d8c2 5525 ret = io_req_defer(req, sqe);
4fe2c963
JL
5526 if (ret) {
5527 if (ret != -EIOCBQUEUED) {
1118591a 5528fail_req:
78e19bbe 5529 io_cqring_add_event(req, ret);
4e88d6e7 5530 req_set_fail_links(req);
78e19bbe 5531 io_double_put_req(req);
4fe2c963 5532 }
2550878f 5533 } else if (req->flags & REQ_F_FORCE_ASYNC) {
1118591a
PB
5534 ret = io_req_defer_prep(req, sqe);
5535 if (unlikely(ret < 0))
5536 goto fail_req;
ce35a47a
JA
5537 /*
5538 * Never try inline submit if IOSQE_ASYNC is set, go straight
5539 * to async execution.
5540 */
5541 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5542 io_queue_async_work(req);
5543 } else {
3529d8c2 5544 __io_queue_sqe(req, sqe);
ce35a47a 5545 }
4fe2c963
JL
5546}
5547
1b4a51b6 5548static inline void io_queue_link_head(struct io_kiocb *req)
4fe2c963 5549{
94ae5e77 5550 if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1b4a51b6
PB
5551 io_cqring_add_event(req, -ECANCELED);
5552 io_double_put_req(req);
5553 } else
3529d8c2 5554 io_queue_sqe(req, NULL);
4fe2c963
JL
5555}
5556
4e88d6e7 5557#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
bcda7baa
JA
5558 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
5559 IOSQE_BUFFER_SELECT)
9e645e11 5560
3529d8c2
JA
5561static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5562 struct io_submit_state *state, struct io_kiocb **link)
9e645e11 5563{
a197f664 5564 struct io_ring_ctx *ctx = req->ctx;
32fe525b 5565 unsigned int sqe_flags;
75c6a039 5566 int ret, id;
9e645e11 5567
32fe525b 5568 sqe_flags = READ_ONCE(sqe->flags);
9e645e11
JA
5569
5570 /* enforce forwards compatibility on users */
32fe525b 5571 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
9e645e11 5572 ret = -EINVAL;
196be95c 5573 goto err_req;
9e645e11
JA
5574 }
5575
bcda7baa
JA
5576 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
5577 !io_op_defs[req->opcode].buffer_select) {
5578 ret = -EOPNOTSUPP;
5579 goto err_req;
5580 }
5581
75c6a039
JA
5582 id = READ_ONCE(sqe->personality);
5583 if (id) {
193155c8
JA
5584 req->work.creds = idr_find(&ctx->personality_idr, id);
5585 if (unlikely(!req->work.creds)) {
75c6a039
JA
5586 ret = -EINVAL;
5587 goto err_req;
5588 }
193155c8 5589 get_cred(req->work.creds);
75c6a039
JA
5590 }
5591
6b47ee6e 5592 /* same numerical values with corresponding REQ_F_*, safe to copy */
8da11c19 5593 req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
bcda7baa
JA
5594 IOSQE_ASYNC | IOSQE_FIXED_FILE |
5595 IOSQE_BUFFER_SELECT);
9e645e11 5596
3529d8c2 5597 ret = io_req_set_file(state, req, sqe);
9e645e11
JA
5598 if (unlikely(ret)) {
5599err_req:
78e19bbe
JA
5600 io_cqring_add_event(req, ret);
5601 io_double_put_req(req);
2e6e1fde 5602 return false;
9e645e11
JA
5603 }
5604
9e645e11
JA
5605 /*
5606 * If we already have a head request, queue this one for async
5607 * submittal once the head completes. If we don't have a head but
5608 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5609 * submitted sync once the chain is complete. If none of those
5610 * conditions are true (normal request), then just queue it.
5611 */
5612 if (*link) {
9d76377f 5613 struct io_kiocb *head = *link;
4e88d6e7 5614
8cdf2193
PB
5615 /*
5616 * Taking sequential execution of a link, draining both sides
5617	 * of the link also fulfills IOSQE_IO_DRAIN semantics for all
5618	 * requests in the link. So it drains the head and the request
5619	 * following the link; the latter is handled via the
5620	 * drain_next flag to persist the effect across calls.
5621 */
711be031
PB
5622 if (sqe_flags & IOSQE_IO_DRAIN) {
5623 head->flags |= REQ_F_IO_DRAIN;
5624 ctx->drain_next = 1;
5625 }
b7bb4f7d 5626 if (io_alloc_async_ctx(req)) {
9e645e11
JA
5627 ret = -EAGAIN;
5628 goto err_req;
5629 }
5630
3529d8c2 5631 ret = io_req_defer_prep(req, sqe);
2d28390a 5632 if (ret) {
4e88d6e7 5633 /* fail even hard links since we don't submit */
9d76377f 5634 head->flags |= REQ_F_FAIL_LINK;
f67676d1 5635 goto err_req;
2d28390a 5636 }
9d76377f
PB
5637 trace_io_uring_link(ctx, req, head);
5638 list_add_tail(&req->link_list, &head->link_list);
32fe525b
PB
5639
5640 /* last request of a link, enqueue the link */
5641 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
5642 io_queue_link_head(head);
5643 *link = NULL;
5644 }
9e645e11 5645 } else {
711be031
PB
5646 if (unlikely(ctx->drain_next)) {
5647 req->flags |= REQ_F_IO_DRAIN;
5648 req->ctx->drain_next = 0;
5649 }
5650 if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
5651 req->flags |= REQ_F_LINK;
711be031
PB
5652 INIT_LIST_HEAD(&req->link_list);
5653 ret = io_req_defer_prep(req, sqe);
5654 if (ret)
5655 req->flags |= REQ_F_FAIL_LINK;
5656 *link = req;
5657 } else {
5658 io_queue_sqe(req, sqe);
5659 }
9e645e11 5660 }
2e6e1fde
PB
5661
5662 return true;
9e645e11
JA
5663}
5664
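
/*
 * Illustrative userspace sketch (not part of this kernel source): how an
 * application would express the link semantics handled above, chaining a
 * write to an fsync with IOSQE_IO_LINK so the fsync only runs if the write
 * completes successfully (IOSQE_IO_HARDLINK would keep the chain alive even
 * on failure). The two SQE pointers are assumed to be consecutive free
 * slots in the mmap'd SQE array.
 */
#include <string.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

static void prep_write_then_fsync(struct io_uring_sqe *write_sqe,
				  struct io_uring_sqe *fsync_sqe,
				  int fd, const struct iovec *iov,
				  unsigned long long offset)
{
	memset(write_sqe, 0, sizeof(*write_sqe));
	write_sqe->opcode = IORING_OP_WRITEV;
	write_sqe->fd = fd;
	write_sqe->addr = (unsigned long) iov;
	write_sqe->len = 1;			/* one iovec */
	write_sqe->off = offset;
	write_sqe->user_data = 1;
	write_sqe->flags = IOSQE_IO_LINK;	/* chain to the next SQE */

	memset(fsync_sqe, 0, sizeof(*fsync_sqe));
	fsync_sqe->opcode = IORING_OP_FSYNC;
	fsync_sqe->fd = fd;
	fsync_sqe->user_data = 2;		/* runs only after the write succeeds */
}
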
9a56a232
JA
5665/*
5666 * Batched submission is done, ensure local IO is flushed out.
5667 */
5668static void io_submit_state_end(struct io_submit_state *state)
5669{
5670 blk_finish_plug(&state->plug);
3d6770fb 5671 io_file_put(state);
2579f913 5672 if (state->free_reqs)
6c8a3134 5673 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
9a56a232
JA
5674}
5675
5676/*
5677 * Start submission side cache.
5678 */
5679static void io_submit_state_start(struct io_submit_state *state,
22efde59 5680 unsigned int max_ios)
9a56a232
JA
5681{
5682 blk_start_plug(&state->plug);
2579f913 5683 state->free_reqs = 0;
9a56a232
JA
5684 state->file = NULL;
5685 state->ios_left = max_ios;
5686}
5687
2b188cc1
JA
5688static void io_commit_sqring(struct io_ring_ctx *ctx)
5689{
75b28aff 5690 struct io_rings *rings = ctx->rings;
2b188cc1 5691
caf582c6
PB
5692 /*
5693 * Ensure any loads from the SQEs are done at this point,
5694 * since once we write the new head, the application could
5695 * write new data to them.
5696 */
5697 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
5698}
5699
2b188cc1 5700/*
3529d8c2 5701 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
2b188cc1
JA
5702 * that is mapped by userspace. This means that care needs to be taken to
5703 * ensure that reads are stable, as we cannot rely on userspace always
5704 * being a good citizen. If members of the sqe are validated and then later
5705 * used, it's important that those reads are done through READ_ONCE() to
5706 * prevent a re-load down the line.
5707 */
3529d8c2
JA
5708static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
5709 const struct io_uring_sqe **sqe_ptr)
2b188cc1 5710{
75b28aff 5711 u32 *sq_array = ctx->sq_array;
2b188cc1
JA
5712 unsigned head;
5713
5714 /*
5715 * The cached sq head (or cq tail) serves two purposes:
5716 *
5717	 * 1) allows us to batch the cost of updating the user visible
5718	 * head.
5719 * 2) allows the kernel side to track the head on its own, even
5720 * though the application is the one updating it.
5721 */
ee7d46d9 5722 head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
9835d6fa 5723 if (likely(head < ctx->sq_entries)) {
cf6fd4bd
PB
5724 /*
5725	 * All IO needs to record the previous position; for LINK vs DRAIN,
5726	 * it can be used to mark the position of the first IO in the
5727	 * link list.
5728 */
5729 req->sequence = ctx->cached_sq_head;
3529d8c2
JA
5730 *sqe_ptr = &ctx->sq_sqes[head];
5731 req->opcode = READ_ONCE((*sqe_ptr)->opcode);
5732 req->user_data = READ_ONCE((*sqe_ptr)->user_data);
2b188cc1
JA
5733 ctx->cached_sq_head++;
5734 return true;
5735 }
5736
5737 /* drop invalid entries */
5738 ctx->cached_sq_head++;
498ccd9e 5739 ctx->cached_sq_dropped++;
ee7d46d9 5740 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
2b188cc1
JA
5741 return false;
5742}
5743
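
/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * application-side counterpart of io_get_sqring()/io_commit_sqring().
 * Userspace fills the SQE and its sq_array slot first, then publishes the
 * new tail with a release store so the kernel's acquire read of the tail
 * only ever observes fully written entries. The sq_ring_view field names
 * are assumptions; they would be derived from mmap() plus io_sqring_offsets.
 */
#include <stdatomic.h>
#include <linux/io_uring.h>

struct sq_ring_view {
	_Atomic unsigned *khead;	/* ring + sq_off.head */
	_Atomic unsigned *ktail;	/* ring + sq_off.tail */
	unsigned ring_mask;		/* *(ring + sq_off.ring_mask) */
	unsigned *array;		/* ring + sq_off.array */
	struct io_uring_sqe *sqes;	/* IORING_OFF_SQES mapping */
};

/* Returns 0 on success, -1 if the SQ ring is currently full. */
static int push_sqe(struct sq_ring_view *sq, const struct io_uring_sqe *src)
{
	unsigned tail = atomic_load_explicit(sq->ktail, memory_order_relaxed);
	unsigned head = atomic_load_explicit(sq->khead, memory_order_acquire);

	if (tail - head > sq->ring_mask)
		return -1;				/* no free slot */

	sq->sqes[tail & sq->ring_mask] = *src;		/* fill the SQE */
	sq->array[tail & sq->ring_mask] = tail & sq->ring_mask;

	/* publish: pairs with the kernel's acquire read of the SQ tail */
	atomic_store_explicit(sq->ktail, tail + 1, memory_order_release);
	return 0;
}
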
fb5ccc98 5744static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
ae9428ca
PB
5745 struct file *ring_file, int ring_fd,
5746 struct mm_struct **mm, bool async)
6c271ce2
JA
5747{
5748 struct io_submit_state state, *statep = NULL;
9e645e11 5749 struct io_kiocb *link = NULL;
9e645e11 5750 int i, submitted = 0;
95a1b3ff 5751 bool mm_fault = false;
6c271ce2 5752
c4a2ed72 5753 /* if we have a backlog and couldn't flush it all, return BUSY */
ad3eb2c8
JA
5754 if (test_bit(0, &ctx->sq_check_overflow)) {
5755 if (!list_empty(&ctx->cq_overflow_list) &&
5756 !io_cqring_overflow_flush(ctx, false))
5757 return -EBUSY;
5758 }
6c271ce2 5759
ee7d46d9
PB
5760 /* make sure SQ entry isn't read before tail */
5761 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
9ef4f124 5762
2b85edfc
PB
5763 if (!percpu_ref_tryget_many(&ctx->refs, nr))
5764 return -EAGAIN;
6c271ce2
JA
5765
5766 if (nr > IO_PLUG_THRESHOLD) {
22efde59 5767 io_submit_state_start(&state, nr);
6c271ce2
JA
5768 statep = &state;
5769 }
5770
b14cca0c
PB
5771 ctx->ring_fd = ring_fd;
5772 ctx->ring_file = ring_file;
5773
6c271ce2 5774 for (i = 0; i < nr; i++) {
3529d8c2 5775 const struct io_uring_sqe *sqe;
196be95c 5776 struct io_kiocb *req;
1cb1edb2 5777 int err;
fb5ccc98 5778
196be95c
PB
5779 req = io_get_req(ctx, statep);
5780 if (unlikely(!req)) {
5781 if (!submitted)
5782 submitted = -EAGAIN;
fb5ccc98 5783 break;
196be95c 5784 }
3529d8c2 5785 if (!io_get_sqring(ctx, req, &sqe)) {
2b85edfc 5786 __io_req_do_free(req);
196be95c
PB
5787 break;
5788 }
fb5ccc98 5789
d3656344
JA
5790 /* will complete beyond this point, count as submitted */
5791 submitted++;
5792
5793 if (unlikely(req->opcode >= IORING_OP_LAST)) {
1cb1edb2
PB
5794 err = -EINVAL;
5795fail_req:
5796 io_cqring_add_event(req, err);
d3656344 5797 io_double_put_req(req);
196be95c
PB
5798 break;
5799 }
fb5ccc98 5800
d3656344 5801 if (io_op_defs[req->opcode].needs_mm && !*mm) {
95a1b3ff 5802 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
1cb1edb2
PB
5803 if (unlikely(mm_fault)) {
5804 err = -EFAULT;
5805 goto fail_req;
95a1b3ff 5806 }
1cb1edb2
PB
5807 use_mm(ctx->sqo_mm);
5808 *mm = ctx->sqo_mm;
9e645e11 5809 }
9e645e11 5810
cf6fd4bd 5811 req->needs_fixed_file = async;
354420f7
JA
5812 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5813 true, async);
3529d8c2 5814 if (!io_submit_sqe(req, sqe, statep, &link))
2e6e1fde 5815 break;
6c271ce2
JA
5816 }
5817
9466f437
PB
5818 if (unlikely(submitted != nr)) {
5819 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5820
5821 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5822 }
9e645e11 5823 if (link)
1b4a51b6 5824 io_queue_link_head(link);
6c271ce2
JA
5825 if (statep)
5826 io_submit_state_end(&state);
5827
ae9428ca
PB
5828 /* Commit SQ ring head once we've consumed and submitted all SQEs */
5829 io_commit_sqring(ctx);
5830
6c271ce2
JA
5831 return submitted;
5832}
5833
5834static int io_sq_thread(void *data)
5835{
6c271ce2
JA
5836 struct io_ring_ctx *ctx = data;
5837 struct mm_struct *cur_mm = NULL;
181e448d 5838 const struct cred *old_cred;
6c271ce2
JA
5839 mm_segment_t old_fs;
5840 DEFINE_WAIT(wait);
6c271ce2 5841 unsigned long timeout;
bdcd3eab 5842 int ret = 0;
6c271ce2 5843
206aefde 5844 complete(&ctx->completions[1]);
a4c0b3de 5845
6c271ce2
JA
5846 old_fs = get_fs();
5847 set_fs(USER_DS);
181e448d 5848 old_cred = override_creds(ctx->creds);
6c271ce2 5849
bdcd3eab 5850 timeout = jiffies + ctx->sq_thread_idle;
2bbcd6d3 5851 while (!kthread_should_park()) {
fb5ccc98 5852 unsigned int to_submit;
6c271ce2 5853
bdcd3eab 5854 if (!list_empty(&ctx->poll_list)) {
6c271ce2
JA
5855 unsigned nr_events = 0;
5856
bdcd3eab
XW
5857 mutex_lock(&ctx->uring_lock);
5858 if (!list_empty(&ctx->poll_list))
5859 io_iopoll_getevents(ctx, &nr_events, 0);
5860 else
6c271ce2 5861 timeout = jiffies + ctx->sq_thread_idle;
bdcd3eab 5862 mutex_unlock(&ctx->uring_lock);
6c271ce2
JA
5863 }
5864
fb5ccc98 5865 to_submit = io_sqring_entries(ctx);
c1edbf5f
JA
5866
5867 /*
5868 * If submit got -EBUSY, flag us as needing the application
5869 * to enter the kernel to reap and flush events.
5870 */
5871 if (!to_submit || ret == -EBUSY) {
7143b5ac
SG
5872 /*
5873 * Drop cur_mm before scheduling, we can't hold it for
5874 * long periods (or over schedule()). Do this before
5875 * adding ourselves to the waitqueue, as the unuse/drop
5876 * may sleep.
5877 */
5878 if (cur_mm) {
5879 unuse_mm(cur_mm);
5880 mmput(cur_mm);
5881 cur_mm = NULL;
5882 }
5883
6c271ce2
JA
5884 /*
5885 * We're polling. If we're within the defined idle
5886 * period, then let us spin without work before going
c1edbf5f
JA
5887	 * to sleep. The exception is if we got EBUSY doing
5888	 * more IO; in that case we should wait for the application
5889	 * to reap events and wake us up.
6c271ce2 5890 */
bdcd3eab 5891 if (!list_empty(&ctx->poll_list) ||
df069d80
JA
5892 (!time_after(jiffies, timeout) && ret != -EBUSY &&
5893 !percpu_ref_is_dying(&ctx->refs))) {
b41e9852
JA
5894 if (current->task_works)
5895 task_work_run();
9831a90c 5896 cond_resched();
6c271ce2
JA
5897 continue;
5898 }
5899
6c271ce2
JA
5900 prepare_to_wait(&ctx->sqo_wait, &wait,
5901 TASK_INTERRUPTIBLE);
5902
bdcd3eab
XW
5903 /*
5904	 * While doing polled IO, before going to sleep, we need
5905	 * to check if there are new reqs added to poll_list;
5906	 * reqs may have been punted to the io worker and will be
5907	 * added to poll_list later, hence check the
5908	 * poll_list again.
5909 */
5910 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5911 !list_empty_careful(&ctx->poll_list)) {
5912 finish_wait(&ctx->sqo_wait, &wait);
5913 continue;
5914 }
5915
6c271ce2 5916 /* Tell userspace we may need a wakeup call */
75b28aff 5917 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
0d7bae69
SB
5918 /* make sure to read SQ tail after writing flags */
5919 smp_mb();
6c271ce2 5920
fb5ccc98 5921 to_submit = io_sqring_entries(ctx);
c1edbf5f 5922 if (!to_submit || ret == -EBUSY) {
2bbcd6d3 5923 if (kthread_should_park()) {
6c271ce2
JA
5924 finish_wait(&ctx->sqo_wait, &wait);
5925 break;
5926 }
b41e9852
JA
5927 if (current->task_works) {
5928 task_work_run();
5929 continue;
5930 }
6c271ce2
JA
5931 if (signal_pending(current))
5932 flush_signals(current);
5933 schedule();
5934 finish_wait(&ctx->sqo_wait, &wait);
5935
75b28aff 5936 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5937 continue;
5938 }
5939 finish_wait(&ctx->sqo_wait, &wait);
5940
75b28aff 5941 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6c271ce2
JA
5942 }
5943
8a4955ff 5944 mutex_lock(&ctx->uring_lock);
1d7bb1d5 5945 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
8a4955ff 5946 mutex_unlock(&ctx->uring_lock);
bdcd3eab 5947 timeout = jiffies + ctx->sq_thread_idle;
6c271ce2
JA
5948 }
5949
b41e9852
JA
5950 if (current->task_works)
5951 task_work_run();
5952
6c271ce2
JA
5953 set_fs(old_fs);
5954 if (cur_mm) {
5955 unuse_mm(cur_mm);
5956 mmput(cur_mm);
5957 }
181e448d 5958 revert_creds(old_cred);
06058632 5959
2bbcd6d3 5960 kthread_parkme();
06058632 5961
6c271ce2
JA
5962 return 0;
5963}
5964
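
/*
 * Illustrative userspace sketch (not part of this kernel source): with
 * IORING_SETUP_SQPOLL, io_sq_thread() above sets IORING_SQ_NEED_WAKEUP
 * before sleeping and re-checks the SQ tail after a full barrier. The
 * application mirrors that: after advancing the SQ tail it issues a full
 * barrier, checks the flag, and only then calls io_uring_enter() with
 * IORING_ENTER_SQ_WAKEUP. The sq_flags pointer is an assumption (it would
 * point at ring + sq_off.flags in the mmap'd SQ ring).
 */
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static void sqpoll_wake_if_needed(int ring_fd, unsigned to_submit,
				  _Atomic unsigned *sq_flags)
{
	/* order the caller's SQ tail store before the flags load below */
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load_explicit(sq_flags, memory_order_relaxed) &
	    IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
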
bda52162
JA
5965struct io_wait_queue {
5966 struct wait_queue_entry wq;
5967 struct io_ring_ctx *ctx;
5968 unsigned to_wait;
5969 unsigned nr_timeouts;
5970};
5971
1d7bb1d5 5972static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
bda52162
JA
5973{
5974 struct io_ring_ctx *ctx = iowq->ctx;
5975
5976 /*
d195a66e 5977 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
5978 * started waiting. For timeouts, we always want to return to userspace,
5979 * regardless of event count.
5980 */
1d7bb1d5 5981 return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
bda52162
JA
5982 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
5983}
5984
5985static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
5986 int wake_flags, void *key)
5987{
5988 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
5989 wq);
5990
1d7bb1d5
JA
5991 /* use noflush == true, as we can't safely rely on locking context */
5992 if (!io_should_wake(iowq, true))
bda52162
JA
5993 return -1;
5994
5995 return autoremove_wake_function(curr, mode, wake_flags, key);
5996}
5997
2b188cc1
JA
5998/*
5999 * Wait until events become available, if we don't already have some. The
6000 * application must reap them itself, as they reside on the shared cq ring.
6001 */
6002static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6003 const sigset_t __user *sig, size_t sigsz)
6004{
bda52162
JA
6005 struct io_wait_queue iowq = {
6006 .wq = {
6007 .private = current,
6008 .func = io_wake_function,
6009 .entry = LIST_HEAD_INIT(iowq.wq.entry),
6010 },
6011 .ctx = ctx,
6012 .to_wait = min_events,
6013 };
75b28aff 6014 struct io_rings *rings = ctx->rings;
e9ffa5c2 6015 int ret = 0;
2b188cc1 6016
b41e9852
JA
6017 do {
6018 if (io_cqring_events(ctx, false) >= min_events)
6019 return 0;
6020 if (!current->task_works)
6021 break;
6022 task_work_run();
6023 } while (1);
2b188cc1
JA
6024
6025 if (sig) {
9e75ad5d
AB
6026#ifdef CONFIG_COMPAT
6027 if (in_compat_syscall())
6028 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 6029 sigsz);
9e75ad5d
AB
6030 else
6031#endif
b772434b 6032 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 6033
2b188cc1
JA
6034 if (ret)
6035 return ret;
6036 }
6037
bda52162 6038 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
c826bd7a 6039 trace_io_uring_cqring_wait(ctx, min_events);
bda52162
JA
6040 do {
6041 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6042 TASK_INTERRUPTIBLE);
b41e9852
JA
6043 if (current->task_works)
6044 task_work_run();
1d7bb1d5 6045 if (io_should_wake(&iowq, false))
bda52162
JA
6046 break;
6047 schedule();
6048 if (signal_pending(current)) {
e9ffa5c2 6049 ret = -EINTR;
bda52162
JA
6050 break;
6051 }
6052 } while (1);
6053 finish_wait(&ctx->wait, &iowq.wq);
6054
e9ffa5c2 6055 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 6056
75b28aff 6057 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
6058}
6059
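
/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * consumer side that io_cqring_wait() unblocks. The application waits via
 * io_uring_enter(IORING_ENTER_GETEVENTS), reads CQEs up to an
 * acquire-loaded tail, and publishes the new head with a release store so
 * the kernel can reuse the slots. The cq_ring_view field names are
 * assumptions derived from mmap() plus io_cqring_offsets.
 */
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

struct cq_ring_view {
	_Atomic unsigned *khead;	/* ring + cq_off.head */
	_Atomic unsigned *ktail;	/* ring + cq_off.tail */
	unsigned ring_mask;		/* *(ring + cq_off.ring_mask) */
	struct io_uring_cqe *cqes;	/* ring + cq_off.cqes */
};

/* Block for at least one completion, then drain whatever is visible. */
static int wait_and_reap(int ring_fd, struct cq_ring_view *cq)
{
	unsigned head, tail;
	int seen = 0;

	if (syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		    IORING_ENTER_GETEVENTS, NULL, 0) < 0)
		return -1;

	head = atomic_load_explicit(cq->khead, memory_order_relaxed);
	tail = atomic_load_explicit(cq->ktail, memory_order_acquire);
	while (head != tail) {
		struct io_uring_cqe *cqe = &cq->cqes[head & cq->ring_mask];

		/* cqe->user_data identifies the request, cqe->res its result */
		(void)cqe;
		head++;
		seen++;
	}
	/* publish the new head so the kernel may reuse these CQE slots */
	atomic_store_explicit(cq->khead, head, memory_order_release);
	return seen;
}
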
6b06314c
JA
6060static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6061{
6062#if defined(CONFIG_UNIX)
6063 if (ctx->ring_sock) {
6064 struct sock *sock = ctx->ring_sock->sk;
6065 struct sk_buff *skb;
6066
6067 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6068 kfree_skb(skb);
6069 }
6070#else
6071 int i;
6072
65e19f54
JA
6073 for (i = 0; i < ctx->nr_user_files; i++) {
6074 struct file *file;
6075
6076 file = io_file_from_index(ctx, i);
6077 if (file)
6078 fput(file);
6079 }
6b06314c
JA
6080#endif
6081}
6082
05f3fb3c
JA
6083static void io_file_ref_kill(struct percpu_ref *ref)
6084{
6085 struct fixed_file_data *data;
6086
6087 data = container_of(ref, struct fixed_file_data, refs);
6088 complete(&data->done);
6089}
6090
6b06314c
JA
6091static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6092{
05f3fb3c 6093 struct fixed_file_data *data = ctx->file_data;
65e19f54
JA
6094 unsigned nr_tables, i;
6095
05f3fb3c 6096 if (!data)
6b06314c
JA
6097 return -ENXIO;
6098
05f3fb3c 6099 percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
e46a7950 6100 flush_work(&data->ref_work);
2faf852d
JA
6101 wait_for_completion(&data->done);
6102 io_ring_file_ref_flush(data);
05f3fb3c
JA
6103 percpu_ref_exit(&data->refs);
6104
6b06314c 6105 __io_sqe_files_unregister(ctx);
65e19f54
JA
6106 nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6107 for (i = 0; i < nr_tables; i++)
05f3fb3c
JA
6108 kfree(data->table[i].files);
6109 kfree(data->table);
6110 kfree(data);
6111 ctx->file_data = NULL;
6b06314c
JA
6112 ctx->nr_user_files = 0;
6113 return 0;
6114}
6115
6c271ce2
JA
6116static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6117{
6118 if (ctx->sqo_thread) {
206aefde 6119 wait_for_completion(&ctx->completions[1]);
2bbcd6d3
RP
6120 /*
6121	 * The park is a bit of a workaround; without it we get
6122	 * warning spew on shutdown with SQPOLL set and affinity
6123 * set to a single CPU.
6124 */
06058632 6125 kthread_park(ctx->sqo_thread);
6c271ce2
JA
6126 kthread_stop(ctx->sqo_thread);
6127 ctx->sqo_thread = NULL;
6128 }
6129}
6130
6b06314c
JA
6131static void io_finish_async(struct io_ring_ctx *ctx)
6132{
6c271ce2
JA
6133 io_sq_thread_stop(ctx);
6134
561fb04a
JA
6135 if (ctx->io_wq) {
6136 io_wq_destroy(ctx->io_wq);
6137 ctx->io_wq = NULL;
6b06314c
JA
6138 }
6139}
6140
6141#if defined(CONFIG_UNIX)
6b06314c
JA
6142/*
6143 * Ensure the UNIX gc is aware of our file set, so we are certain that
6144 * the io_uring can be safely unregistered on process exit, even if we have
6145 * loops in the file referencing.
6146 */
6147static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6148{
6149 struct sock *sk = ctx->ring_sock->sk;
6150 struct scm_fp_list *fpl;
6151 struct sk_buff *skb;
08a45173 6152 int i, nr_files;
6b06314c
JA
6153
6154 if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
6155 unsigned long inflight = ctx->user->unix_inflight + nr;
6156
6157 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
6158 return -EMFILE;
6159 }
6160
6161 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6162 if (!fpl)
6163 return -ENOMEM;
6164
6165 skb = alloc_skb(0, GFP_KERNEL);
6166 if (!skb) {
6167 kfree(fpl);
6168 return -ENOMEM;
6169 }
6170
6171 skb->sk = sk;
6b06314c 6172
08a45173 6173 nr_files = 0;
6b06314c
JA
6174 fpl->user = get_uid(ctx->user);
6175 for (i = 0; i < nr; i++) {
65e19f54
JA
6176 struct file *file = io_file_from_index(ctx, i + offset);
6177
6178 if (!file)
08a45173 6179 continue;
65e19f54 6180 fpl->fp[nr_files] = get_file(file);
08a45173
JA
6181 unix_inflight(fpl->user, fpl->fp[nr_files]);
6182 nr_files++;
6b06314c
JA
6183 }
6184
08a45173
JA
6185 if (nr_files) {
6186 fpl->max = SCM_MAX_FD;
6187 fpl->count = nr_files;
6188 UNIXCB(skb).fp = fpl;
05f3fb3c 6189 skb->destructor = unix_destruct_scm;
08a45173
JA
6190 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6191 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 6192
08a45173
JA
6193 for (i = 0; i < nr_files; i++)
6194 fput(fpl->fp[i]);
6195 } else {
6196 kfree_skb(skb);
6197 kfree(fpl);
6198 }
6b06314c
JA
6199
6200 return 0;
6201}
6202
6203/*
6204 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6205 * causes regular reference counting to break down. We rely on the UNIX
6206 * garbage collection to take care of this problem for us.
6207 */
6208static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6209{
6210 unsigned left, total;
6211 int ret = 0;
6212
6213 total = 0;
6214 left = ctx->nr_user_files;
6215 while (left) {
6216 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
6217
6218 ret = __io_sqe_files_scm(ctx, this_files, total);
6219 if (ret)
6220 break;
6221 left -= this_files;
6222 total += this_files;
6223 }
6224
6225 if (!ret)
6226 return 0;
6227
6228 while (total < ctx->nr_user_files) {
65e19f54
JA
6229 struct file *file = io_file_from_index(ctx, total);
6230
6231 if (file)
6232 fput(file);
6b06314c
JA
6233 total++;
6234 }
6235
6236 return ret;
6237}
6238#else
6239static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6240{
6241 return 0;
6242}
6243#endif
6244
65e19f54
JA
6245static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6246 unsigned nr_files)
6247{
6248 int i;
6249
6250 for (i = 0; i < nr_tables; i++) {
05f3fb3c 6251 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
6252 unsigned this_files;
6253
6254 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6255 table->files = kcalloc(this_files, sizeof(struct file *),
6256 GFP_KERNEL);
6257 if (!table->files)
6258 break;
6259 nr_files -= this_files;
6260 }
6261
6262 if (i == nr_tables)
6263 return 0;
6264
6265 for (i = 0; i < nr_tables; i++) {
05f3fb3c 6266 struct fixed_file_table *table = &ctx->file_data->table[i];
65e19f54
JA
6267 kfree(table->files);
6268 }
6269 return 1;
6270}
6271
05f3fb3c
JA
6272static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
6273{
6274#if defined(CONFIG_UNIX)
6275 struct sock *sock = ctx->ring_sock->sk;
6276 struct sk_buff_head list, *head = &sock->sk_receive_queue;
6277 struct sk_buff *skb;
6278 int i;
6279
6280 __skb_queue_head_init(&list);
6281
6282 /*
6283 * Find the skb that holds this file in its SCM_RIGHTS. When found,
6284 * remove this entry and rearrange the file array.
6285 */
6286 skb = skb_dequeue(head);
6287 while (skb) {
6288 struct scm_fp_list *fp;
6289
6290 fp = UNIXCB(skb).fp;
6291 for (i = 0; i < fp->count; i++) {
6292 int left;
6293
6294 if (fp->fp[i] != file)
6295 continue;
6296
6297 unix_notinflight(fp->user, fp->fp[i]);
6298 left = fp->count - 1 - i;
6299 if (left) {
6300 memmove(&fp->fp[i], &fp->fp[i + 1],
6301 left * sizeof(struct file *));
6302 }
6303 fp->count--;
6304 if (!fp->count) {
6305 kfree_skb(skb);
6306 skb = NULL;
6307 } else {
6308 __skb_queue_tail(&list, skb);
6309 }
6310 fput(file);
6311 file = NULL;
6312 break;
6313 }
6314
6315 if (!file)
6316 break;
6317
6318 __skb_queue_tail(&list, skb);
6319
6320 skb = skb_dequeue(head);
6321 }
6322
6323 if (skb_peek(&list)) {
6324 spin_lock_irq(&head->lock);
6325 while ((skb = __skb_dequeue(&list)) != NULL)
6326 __skb_queue_tail(head, skb);
6327 spin_unlock_irq(&head->lock);
6328 }
6329#else
6330 fput(file);
6331#endif
6332}
6333
6334struct io_file_put {
6335 struct llist_node llist;
6336 struct file *file;
6337 struct completion *done;
6338};
6339
2faf852d 6340static void io_ring_file_ref_flush(struct fixed_file_data *data)
65e19f54 6341{
05f3fb3c 6342 struct io_file_put *pfile, *tmp;
05f3fb3c 6343 struct llist_node *node;
65e19f54 6344
05f3fb3c
JA
6345 while ((node = llist_del_all(&data->put_llist)) != NULL) {
6346 llist_for_each_entry_safe(pfile, tmp, node, llist) {
6347 io_ring_file_put(data->ctx, pfile->file);
6348 if (pfile->done)
6349 complete(pfile->done);
6350 else
6351 kfree(pfile);
6352 }
65e19f54 6353 }
2faf852d 6354}
65e19f54 6355
2faf852d
JA
6356static void io_ring_file_ref_switch(struct work_struct *work)
6357{
6358 struct fixed_file_data *data;
65e19f54 6359
2faf852d
JA
6360 data = container_of(work, struct fixed_file_data, ref_work);
6361 io_ring_file_ref_flush(data);
05f3fb3c
JA
6362 percpu_ref_switch_to_percpu(&data->refs);
6363}
65e19f54 6364
05f3fb3c
JA
6365static void io_file_data_ref_zero(struct percpu_ref *ref)
6366{
6367 struct fixed_file_data *data;
6368
6369 data = container_of(ref, struct fixed_file_data, refs);
6370
2faf852d
JA
6371 /*
6372 * We can't safely switch from inside this context, punt to wq. If
6373 * the table ref is going away, the table is being unregistered.
6374 * Don't queue up the async work for that case, the caller will
6375 * handle it.
6376 */
6377 if (!percpu_ref_is_dying(&data->refs))
6378 queue_work(system_wq, &data->ref_work);
65e19f54
JA
6379}
6380
6b06314c
JA
6381static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6382 unsigned nr_args)
6383{
6384 __s32 __user *fds = (__s32 __user *) arg;
65e19f54 6385 unsigned nr_tables;
05f3fb3c 6386 struct file *file;
6b06314c
JA
6387 int fd, ret = 0;
6388 unsigned i;
6389
05f3fb3c 6390 if (ctx->file_data)
6b06314c
JA
6391 return -EBUSY;
6392 if (!nr_args)
6393 return -EINVAL;
6394 if (nr_args > IORING_MAX_FIXED_FILES)
6395 return -EMFILE;
6396
05f3fb3c
JA
6397 ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6398 if (!ctx->file_data)
6399 return -ENOMEM;
6400 ctx->file_data->ctx = ctx;
6401 init_completion(&ctx->file_data->done);
6402
65e19f54 6403 nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
05f3fb3c
JA
6404 ctx->file_data->table = kcalloc(nr_tables,
6405 sizeof(struct fixed_file_table),
65e19f54 6406 GFP_KERNEL);
05f3fb3c
JA
6407 if (!ctx->file_data->table) {
6408 kfree(ctx->file_data);
6409 ctx->file_data = NULL;
6b06314c 6410 return -ENOMEM;
05f3fb3c
JA
6411 }
6412
6413 if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
6414 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6415 kfree(ctx->file_data->table);
6416 kfree(ctx->file_data);
6417 ctx->file_data = NULL;
6b06314c 6418 return -ENOMEM;
05f3fb3c
JA
6419 }
6420 ctx->file_data->put_llist.first = NULL;
6421 INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
6b06314c 6422
65e19f54 6423 if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
05f3fb3c
JA
6424 percpu_ref_exit(&ctx->file_data->refs);
6425 kfree(ctx->file_data->table);
6426 kfree(ctx->file_data);
6427 ctx->file_data = NULL;
65e19f54
JA
6428 return -ENOMEM;
6429 }
6430
08a45173 6431 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
65e19f54
JA
6432 struct fixed_file_table *table;
6433 unsigned index;
6434
6b06314c
JA
6435 ret = -EFAULT;
6436 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6437 break;
08a45173
JA
6438 /* allow sparse sets */
6439 if (fd == -1) {
6440 ret = 0;
6441 continue;
6442 }
6b06314c 6443
05f3fb3c 6444 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54 6445 index = i & IORING_FILE_TABLE_MASK;
05f3fb3c 6446 file = fget(fd);
6b06314c
JA
6447
6448 ret = -EBADF;
05f3fb3c 6449 if (!file)
6b06314c 6450 break;
05f3fb3c 6451
6b06314c
JA
6452 /*
6453 * Don't allow io_uring instances to be registered. If UNIX
6454 * isn't enabled, then this causes a reference cycle and this
6455 * instance can never get freed. If UNIX is enabled we'll
6456 * handle it just fine, but there's still no point in allowing
6457 * a ring fd as it doesn't support regular read/write anyway.
6458 */
05f3fb3c
JA
6459 if (file->f_op == &io_uring_fops) {
6460 fput(file);
6b06314c
JA
6461 break;
6462 }
6b06314c 6463 ret = 0;
05f3fb3c 6464 table->files[index] = file;
6b06314c
JA
6465 }
6466
6467 if (ret) {
65e19f54 6468 for (i = 0; i < ctx->nr_user_files; i++) {
65e19f54
JA
6469 file = io_file_from_index(ctx, i);
6470 if (file)
6471 fput(file);
6472 }
6473 for (i = 0; i < nr_tables; i++)
05f3fb3c 6474 kfree(ctx->file_data->table[i].files);
6b06314c 6475
05f3fb3c
JA
6476 kfree(ctx->file_data->table);
6477 kfree(ctx->file_data);
6478 ctx->file_data = NULL;
6b06314c
JA
6479 ctx->nr_user_files = 0;
6480 return ret;
6481 }
6482
6483 ret = io_sqe_files_scm(ctx);
6484 if (ret)
6485 io_sqe_files_unregister(ctx);
6486
6487 return ret;
6488}
6489
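
/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * registration call that lands in io_sqe_files_register() above. A slot
 * holding -1 stays sparse and can be filled later with a files update.
 * Requests then reference slot N by setting IOSQE_FIXED_FILE and sqe->fd = N.
 */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Register 'fd' in fixed-file slot 0 and leave slots 1-3 sparse. */
static int register_fixed_files(int ring_fd, int fd)
{
	__s32 fds[4] = { fd, -1, -1, -1 };

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES, fds, 4);
}
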
c3a31e60
JA
6490static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
6491 int index)
6492{
6493#if defined(CONFIG_UNIX)
6494 struct sock *sock = ctx->ring_sock->sk;
6495 struct sk_buff_head *head = &sock->sk_receive_queue;
6496 struct sk_buff *skb;
6497
6498 /*
6499 * See if we can merge this file into an existing skb SCM_RIGHTS
6500 * file set. If there's no room, fall back to allocating a new skb
6501 * and filling it in.
6502 */
6503 spin_lock_irq(&head->lock);
6504 skb = skb_peek(head);
6505 if (skb) {
6506 struct scm_fp_list *fpl = UNIXCB(skb).fp;
6507
6508 if (fpl->count < SCM_MAX_FD) {
6509 __skb_unlink(skb, head);
6510 spin_unlock_irq(&head->lock);
6511 fpl->fp[fpl->count] = get_file(file);
6512 unix_inflight(fpl->user, fpl->fp[fpl->count]);
6513 fpl->count++;
6514 spin_lock_irq(&head->lock);
6515 __skb_queue_head(head, skb);
6516 } else {
6517 skb = NULL;
6518 }
6519 }
6520 spin_unlock_irq(&head->lock);
6521
6522 if (skb) {
6523 fput(file);
6524 return 0;
6525 }
6526
6527 return __io_sqe_files_scm(ctx, 1, index);
6528#else
6529 return 0;
6530#endif
6531}
6532
05f3fb3c 6533static void io_atomic_switch(struct percpu_ref *ref)
c3a31e60 6534{
05f3fb3c
JA
6535 struct fixed_file_data *data;
6536
dd3db2a3
JA
6537 /*
6538 * Juggle reference to ensure we hit zero, if needed, so we can
6539 * switch back to percpu mode
6540 */
05f3fb3c 6541 data = container_of(ref, struct fixed_file_data, refs);
dd3db2a3
JA
6542 percpu_ref_put(&data->refs);
6543 percpu_ref_get(&data->refs);
05f3fb3c
JA
6544}
6545
6546static bool io_queue_file_removal(struct fixed_file_data *data,
6547 struct file *file)
6548{
6549 struct io_file_put *pfile, pfile_stack;
6550 DECLARE_COMPLETION_ONSTACK(done);
6551
6552 /*
6553	 * If we fail allocating the struct we need for doing async removal
6554 * of this file, just punt to sync and wait for it.
6555 */
6556 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6557 if (!pfile) {
6558 pfile = &pfile_stack;
6559 pfile->done = &done;
6560 }
6561
6562 pfile->file = file;
6563 llist_add(&pfile->llist, &data->put_llist);
6564
6565 if (pfile == &pfile_stack) {
dd3db2a3 6566 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
05f3fb3c
JA
6567 wait_for_completion(&done);
6568 flush_work(&data->ref_work);
6569 return false;
6570 }
6571
6572 return true;
6573}
6574
6575static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6576 struct io_uring_files_update *up,
6577 unsigned nr_args)
6578{
6579 struct fixed_file_data *data = ctx->file_data;
6580 bool ref_switch = false;
6581 struct file *file;
c3a31e60
JA
6582 __s32 __user *fds;
6583 int fd, i, err;
6584 __u32 done;
6585
05f3fb3c 6586 if (check_add_overflow(up->offset, nr_args, &done))
c3a31e60
JA
6587 return -EOVERFLOW;
6588 if (done > ctx->nr_user_files)
6589 return -EINVAL;
6590
6591 done = 0;
05f3fb3c 6592 fds = u64_to_user_ptr(up->fds);
c3a31e60 6593 while (nr_args) {
65e19f54
JA
6594 struct fixed_file_table *table;
6595 unsigned index;
6596
c3a31e60
JA
6597 err = 0;
6598 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6599 err = -EFAULT;
6600 break;
6601 }
05f3fb3c
JA
6602 i = array_index_nospec(up->offset, ctx->nr_user_files);
6603 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
65e19f54
JA
6604 index = i & IORING_FILE_TABLE_MASK;
6605 if (table->files[index]) {
05f3fb3c 6606 file = io_file_from_index(ctx, index);
65e19f54 6607 table->files[index] = NULL;
05f3fb3c
JA
6608 if (io_queue_file_removal(data, file))
6609 ref_switch = true;
c3a31e60
JA
6610 }
6611 if (fd != -1) {
c3a31e60
JA
6612 file = fget(fd);
6613 if (!file) {
6614 err = -EBADF;
6615 break;
6616 }
6617 /*
6618 * Don't allow io_uring instances to be registered. If
6619 * UNIX isn't enabled, then this causes a reference
6620 * cycle and this instance can never get freed. If UNIX
6621 * is enabled we'll handle it just fine, but there's
6622 * still no point in allowing a ring fd as it doesn't
6623 * support regular read/write anyway.
6624 */
6625 if (file->f_op == &io_uring_fops) {
6626 fput(file);
6627 err = -EBADF;
6628 break;
6629 }
65e19f54 6630 table->files[index] = file;
c3a31e60
JA
6631 err = io_sqe_file_register(ctx, file, i);
6632 if (err)
6633 break;
6634 }
6635 nr_args--;
6636 done++;
05f3fb3c
JA
6637 up->offset++;
6638 }
6639
dd3db2a3 6640 if (ref_switch)
05f3fb3c 6641 percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
c3a31e60
JA
6642
6643 return done ? done : err;
6644}
05f3fb3c
JA
6645static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6646 unsigned nr_args)
6647{
6648 struct io_uring_files_update up;
6649
6650 if (!ctx->file_data)
6651 return -ENXIO;
6652 if (!nr_args)
6653 return -EINVAL;
6654 if (copy_from_user(&up, arg, sizeof(up)))
6655 return -EFAULT;
6656 if (up.resv)
6657 return -EINVAL;
6658
6659 return __io_sqe_files_update(ctx, &up, nr_args);
6660}
c3a31e60 6661
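
/*
 * Illustrative userspace sketch (not part of this kernel source): driving
 * __io_sqe_files_update() from userspace. Passing -1 clears a slot (the
 * old file is queued for deferred release); any other fd replaces it.
 * On success the call returns the number of files updated.
 */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int update_fixed_file(int ring_fd, unsigned slot, int new_fd)
{
	__s32 fd = new_fd;
	struct io_uring_files_update up = {
		.offset	= slot,
		.fds	= (unsigned long) &fd,
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES_UPDATE, &up, 1);
}
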
e9fd9396 6662static void io_free_work(struct io_wq_work *work)
7d723065
JA
6663{
6664 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6665
e9fd9396 6666 /* Consider that io_steal_work() relies on this ref */
7d723065
JA
6667 io_put_req(req);
6668}
6669
24369c2e
PB
6670static int io_init_wq_offload(struct io_ring_ctx *ctx,
6671 struct io_uring_params *p)
6672{
6673 struct io_wq_data data;
6674 struct fd f;
6675 struct io_ring_ctx *ctx_attach;
6676 unsigned int concurrency;
6677 int ret = 0;
6678
6679 data.user = ctx->user;
e9fd9396 6680 data.free_work = io_free_work;
24369c2e
PB
6681
6682 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6683 /* Do QD, or 4 * CPUS, whatever is smallest */
6684 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6685
6686 ctx->io_wq = io_wq_create(concurrency, &data);
6687 if (IS_ERR(ctx->io_wq)) {
6688 ret = PTR_ERR(ctx->io_wq);
6689 ctx->io_wq = NULL;
6690 }
6691 return ret;
6692 }
6693
6694 f = fdget(p->wq_fd);
6695 if (!f.file)
6696 return -EBADF;
6697
6698 if (f.file->f_op != &io_uring_fops) {
6699 ret = -EINVAL;
6700 goto out_fput;
6701 }
6702
6703 ctx_attach = f.file->private_data;
6704 /* @io_wq is protected by holding the fd */
6705 if (!io_wq_get(ctx_attach->io_wq, &data)) {
6706 ret = -EINVAL;
6707 goto out_fput;
6708 }
6709
6710 ctx->io_wq = ctx_attach->io_wq;
6711out_fput:
6712 fdput(f);
6713 return ret;
6714}
6715
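
/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * IORING_SETUP_ATTACH_WQ path handled by io_init_wq_offload() above. A
 * second ring created this way shares the async io-wq backend of the ring
 * identified by wq_fd instead of spawning its own worker pool.
 */
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int setup_attached_ring(int existing_ring_fd, unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_ATTACH_WQ;
	p.wq_fd = existing_ring_fd;

	return syscall(__NR_io_uring_setup, entries, &p);
}
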
6c271ce2
JA
6716static int io_sq_offload_start(struct io_ring_ctx *ctx,
6717 struct io_uring_params *p)
2b188cc1
JA
6718{
6719 int ret;
6720
6c271ce2 6721 init_waitqueue_head(&ctx->sqo_wait);
2b188cc1
JA
6722 mmgrab(current->mm);
6723 ctx->sqo_mm = current->mm;
6724
6c271ce2 6725 if (ctx->flags & IORING_SETUP_SQPOLL) {
3ec482d1
JA
6726 ret = -EPERM;
6727 if (!capable(CAP_SYS_ADMIN))
6728 goto err;
6729
917257da
JA
6730 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6731 if (!ctx->sq_thread_idle)
6732 ctx->sq_thread_idle = HZ;
6733
6c271ce2 6734 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 6735 int cpu = p->sq_thread_cpu;
6c271ce2 6736
917257da 6737 ret = -EINVAL;
44a9bd18
JA
6738 if (cpu >= nr_cpu_ids)
6739 goto err;
7889f44d 6740 if (!cpu_online(cpu))
917257da
JA
6741 goto err;
6742
6c271ce2
JA
6743 ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6744 ctx, cpu,
6745 "io_uring-sq");
6746 } else {
6747 ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6748 "io_uring-sq");
6749 }
6750 if (IS_ERR(ctx->sqo_thread)) {
6751 ret = PTR_ERR(ctx->sqo_thread);
6752 ctx->sqo_thread = NULL;
6753 goto err;
6754 }
6755 wake_up_process(ctx->sqo_thread);
6756 } else if (p->flags & IORING_SETUP_SQ_AFF) {
6757 /* Can't have SQ_AFF without SQPOLL */
6758 ret = -EINVAL;
6759 goto err;
6760 }
6761
24369c2e
PB
6762 ret = io_init_wq_offload(ctx, p);
6763 if (ret)
2b188cc1 6764 goto err;
2b188cc1
JA
6765
6766 return 0;
6767err:
54a91f3b 6768 io_finish_async(ctx);
2b188cc1
JA
6769 mmdrop(ctx->sqo_mm);
6770 ctx->sqo_mm = NULL;
6771 return ret;
6772}
6773
6774static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6775{
6776 atomic_long_sub(nr_pages, &user->locked_vm);
6777}
6778
6779static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
6780{
6781 unsigned long page_limit, cur_pages, new_pages;
6782
6783 /* Don't allow more pages than we can safely lock */
6784 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
6785
6786 do {
6787 cur_pages = atomic_long_read(&user->locked_vm);
6788 new_pages = cur_pages + nr_pages;
6789 if (new_pages > page_limit)
6790 return -ENOMEM;
6791 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
6792 new_pages) != cur_pages);
6793
6794 return 0;
6795}
6796
6797static void io_mem_free(void *ptr)
6798{
52e04ef4
MR
6799 struct page *page;
6800
6801 if (!ptr)
6802 return;
2b188cc1 6803
52e04ef4 6804 page = virt_to_head_page(ptr);
2b188cc1
JA
6805 if (put_page_testzero(page))
6806 free_compound_page(page);
6807}
6808
6809static void *io_mem_alloc(size_t size)
6810{
6811 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
6812 __GFP_NORETRY;
6813
6814 return (void *) __get_free_pages(gfp_flags, get_order(size));
6815}
6816
75b28aff
HV
6817static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
6818 size_t *sq_offset)
6819{
6820 struct io_rings *rings;
6821 size_t off, sq_array_size;
6822
6823 off = struct_size(rings, cqes, cq_entries);
6824 if (off == SIZE_MAX)
6825 return SIZE_MAX;
6826
6827#ifdef CONFIG_SMP
6828 off = ALIGN(off, SMP_CACHE_BYTES);
6829 if (off == 0)
6830 return SIZE_MAX;
6831#endif
6832
6833 sq_array_size = array_size(sizeof(u32), sq_entries);
6834 if (sq_array_size == SIZE_MAX)
6835 return SIZE_MAX;
6836
6837 if (check_add_overflow(off, sq_array_size, &off))
6838 return SIZE_MAX;
6839
6840 if (sq_offset)
6841 *sq_offset = off;
6842
6843 return off;
6844}
6845
2b188cc1
JA
6846static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
6847{
75b28aff 6848 size_t pages;
2b188cc1 6849
75b28aff
HV
6850 pages = (size_t)1 << get_order(
6851 rings_size(sq_entries, cq_entries, NULL));
6852 pages += (size_t)1 << get_order(
6853 array_size(sizeof(struct io_uring_sqe), sq_entries));
2b188cc1 6854
75b28aff 6855 return pages;
2b188cc1
JA
6856}
6857
edafccee
JA
6858static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
6859{
6860 int i, j;
6861
6862 if (!ctx->user_bufs)
6863 return -ENXIO;
6864
6865 for (i = 0; i < ctx->nr_user_bufs; i++) {
6866 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6867
6868 for (j = 0; j < imu->nr_bvecs; j++)
f1f6a7dd 6869 unpin_user_page(imu->bvec[j].bv_page);
edafccee
JA
6870
6871 if (ctx->account_mem)
6872 io_unaccount_mem(ctx->user, imu->nr_bvecs);
d4ef6475 6873 kvfree(imu->bvec);
edafccee
JA
6874 imu->nr_bvecs = 0;
6875 }
6876
6877 kfree(ctx->user_bufs);
6878 ctx->user_bufs = NULL;
6879 ctx->nr_user_bufs = 0;
6880 return 0;
6881}
6882
6883static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
6884 void __user *arg, unsigned index)
6885{
6886 struct iovec __user *src;
6887
6888#ifdef CONFIG_COMPAT
6889 if (ctx->compat) {
6890 struct compat_iovec __user *ciovs;
6891 struct compat_iovec ciov;
6892
6893 ciovs = (struct compat_iovec __user *) arg;
6894 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
6895 return -EFAULT;
6896
d55e5f5b 6897 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
6898 dst->iov_len = ciov.iov_len;
6899 return 0;
6900 }
6901#endif
6902 src = (struct iovec __user *) arg;
6903 if (copy_from_user(dst, &src[index], sizeof(*dst)))
6904 return -EFAULT;
6905 return 0;
6906}
6907
6908static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
6909 unsigned nr_args)
6910{
6911 struct vm_area_struct **vmas = NULL;
6912 struct page **pages = NULL;
6913 int i, j, got_pages = 0;
6914 int ret = -EINVAL;
6915
6916 if (ctx->user_bufs)
6917 return -EBUSY;
6918 if (!nr_args || nr_args > UIO_MAXIOV)
6919 return -EINVAL;
6920
6921 ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
6922 GFP_KERNEL);
6923 if (!ctx->user_bufs)
6924 return -ENOMEM;
6925
6926 for (i = 0; i < nr_args; i++) {
6927 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6928 unsigned long off, start, end, ubuf;
6929 int pret, nr_pages;
6930 struct iovec iov;
6931 size_t size;
6932
6933 ret = io_copy_iov(ctx, &iov, arg, i);
6934 if (ret)
a278682d 6935 goto err;
edafccee
JA
6936
6937 /*
6938 * Don't impose further limits on the size and buffer
6939 * constraints here, we'll -EINVAL later when IO is
6940 * submitted if they are wrong.
6941 */
6942 ret = -EFAULT;
6943 if (!iov.iov_base || !iov.iov_len)
6944 goto err;
6945
6946 /* arbitrary limit, but we need something */
6947 if (iov.iov_len > SZ_1G)
6948 goto err;
6949
6950 ubuf = (unsigned long) iov.iov_base;
6951 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
6952 start = ubuf >> PAGE_SHIFT;
6953 nr_pages = end - start;
6954
6955 if (ctx->account_mem) {
6956 ret = io_account_mem(ctx->user, nr_pages);
6957 if (ret)
6958 goto err;
6959 }
6960
6961 ret = 0;
6962 if (!pages || nr_pages > got_pages) {
6963 kfree(vmas);
6964 kfree(pages);
d4ef6475 6965 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
edafccee 6966 GFP_KERNEL);
d4ef6475 6967 vmas = kvmalloc_array(nr_pages,
edafccee
JA
6968 sizeof(struct vm_area_struct *),
6969 GFP_KERNEL);
6970 if (!pages || !vmas) {
6971 ret = -ENOMEM;
6972 if (ctx->account_mem)
6973 io_unaccount_mem(ctx->user, nr_pages);
6974 goto err;
6975 }
6976 got_pages = nr_pages;
6977 }
6978
d4ef6475 6979 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
edafccee
JA
6980 GFP_KERNEL);
6981 ret = -ENOMEM;
6982 if (!imu->bvec) {
6983 if (ctx->account_mem)
6984 io_unaccount_mem(ctx->user, nr_pages);
6985 goto err;
6986 }
6987
6988 ret = 0;
6989 down_read(&current->mm->mmap_sem);
2113b05d 6990 pret = pin_user_pages(ubuf, nr_pages,
932f4a63
IW
6991 FOLL_WRITE | FOLL_LONGTERM,
6992 pages, vmas);
edafccee
JA
6993 if (pret == nr_pages) {
6994 /* don't support file backed memory */
6995 for (j = 0; j < nr_pages; j++) {
6996 struct vm_area_struct *vma = vmas[j];
6997
6998 if (vma->vm_file &&
6999 !is_file_hugepages(vma->vm_file)) {
7000 ret = -EOPNOTSUPP;
7001 break;
7002 }
7003 }
7004 } else {
7005 ret = pret < 0 ? pret : -EFAULT;
7006 }
7007 up_read(&current->mm->mmap_sem);
7008 if (ret) {
7009 /*
7010 * if we did partial map, or found file backed vmas,
7011 * release any pages we did get
7012 */
27c4d3a3 7013 if (pret > 0)
f1f6a7dd 7014 unpin_user_pages(pages, pret);
edafccee
JA
7015 if (ctx->account_mem)
7016 io_unaccount_mem(ctx->user, nr_pages);
d4ef6475 7017 kvfree(imu->bvec);
edafccee
JA
7018 goto err;
7019 }
7020
7021 off = ubuf & ~PAGE_MASK;
7022 size = iov.iov_len;
7023 for (j = 0; j < nr_pages; j++) {
7024 size_t vec_len;
7025
7026 vec_len = min_t(size_t, size, PAGE_SIZE - off);
7027 imu->bvec[j].bv_page = pages[j];
7028 imu->bvec[j].bv_len = vec_len;
7029 imu->bvec[j].bv_offset = off;
7030 off = 0;
7031 size -= vec_len;
7032 }
7033 /* store original address for later verification */
7034 imu->ubuf = ubuf;
7035 imu->len = iov.iov_len;
7036 imu->nr_bvecs = nr_pages;
7037
7038 ctx->nr_user_bufs++;
7039 }
d4ef6475
MR
7040 kvfree(pages);
7041 kvfree(vmas);
edafccee
JA
7042 return 0;
7043err:
d4ef6475
MR
7044 kvfree(pages);
7045 kvfree(vmas);
edafccee
JA
7046 io_sqe_buffer_unregister(ctx);
7047 return ret;
7048}
7049
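
/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * buffer registration that io_sqe_buffer_register() above pins and
 * accounts against RLIMIT_MEMLOCK (each iovec is capped at 1 GiB). Once
 * registered, the buffer is addressed by index from
 * IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED via sqe->buf_index.
 */
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Pin one 64 KiB anonymous buffer as fixed-buffer index 0. */
static void *register_one_buffer(int ring_fd)
{
	size_t len = 64 * 1024;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	if (buf == MAP_FAILED)
		return NULL;
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_BUFFERS, &iov, 1) < 0) {
		munmap(buf, len);
		return NULL;
	}
	return buf;
}
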
9b402849
JA
7050static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7051{
7052 __s32 __user *fds = arg;
7053 int fd;
7054
7055 if (ctx->cq_ev_fd)
7056 return -EBUSY;
7057
7058 if (copy_from_user(&fd, fds, sizeof(*fds)))
7059 return -EFAULT;
7060
7061 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7062 if (IS_ERR(ctx->cq_ev_fd)) {
7063 int ret = PTR_ERR(ctx->cq_ev_fd);
7064 ctx->cq_ev_fd = NULL;
7065 return ret;
7066 }
7067
7068 return 0;
7069}
7070
7071static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7072{
7073 if (ctx->cq_ev_fd) {
7074 eventfd_ctx_put(ctx->cq_ev_fd);
7075 ctx->cq_ev_fd = NULL;
7076 return 0;
7077 }
7078
7079 return -ENXIO;
7080}
7081
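
/*
 * Illustrative userspace sketch (not part of this kernel source): pairing
 * io_eventfd_register() above with an eventfd so CQE postings can be picked
 * up from an epoll/poll loop instead of busy-checking the CQ ring.
 */
#include <sys/syscall.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Returns an eventfd that becomes readable whenever CQEs are posted. */
static int attach_cq_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (efd < 0)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_EVENTFD, &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* add to epoll; read() it before reaping the CQ ring */
}
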
5a2e745d
JA
7082static int __io_destroy_buffers(int id, void *p, void *data)
7083{
7084 struct io_ring_ctx *ctx = data;
7085 struct io_buffer *buf = p;
7086
067524e9 7087 __io_remove_buffers(ctx, buf, id, -1U);
5a2e745d
JA
7088 return 0;
7089}
7090
7091static void io_destroy_buffers(struct io_ring_ctx *ctx)
7092{
7093 idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
7094 idr_destroy(&ctx->io_buffer_idr);
7095}
7096
2b188cc1
JA
7097static void io_ring_ctx_free(struct io_ring_ctx *ctx)
7098{
6b06314c 7099 io_finish_async(ctx);
2b188cc1
JA
7100 if (ctx->sqo_mm)
7101 mmdrop(ctx->sqo_mm);
def596e9
JA
7102
7103 io_iopoll_reap_events(ctx);
edafccee 7104 io_sqe_buffer_unregister(ctx);
6b06314c 7105 io_sqe_files_unregister(ctx);
9b402849 7106 io_eventfd_unregister(ctx);
5a2e745d 7107 io_destroy_buffers(ctx);
41726c9a 7108 idr_destroy(&ctx->personality_idr);
def596e9 7109
2b188cc1 7110#if defined(CONFIG_UNIX)
355e8d26
EB
7111 if (ctx->ring_sock) {
7112 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 7113 sock_release(ctx->ring_sock);
355e8d26 7114 }
2b188cc1
JA
7115#endif
7116
75b28aff 7117 io_mem_free(ctx->rings);
2b188cc1 7118 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
7119
7120 percpu_ref_exit(&ctx->refs);
7121 if (ctx->account_mem)
7122 io_unaccount_mem(ctx->user,
7123 ring_pages(ctx->sq_entries, ctx->cq_entries));
7124 free_uid(ctx->user);
181e448d 7125 put_cred(ctx->creds);
206aefde 7126 kfree(ctx->completions);
78076bb6 7127 kfree(ctx->cancel_hash);
0ddf92e8 7128 kmem_cache_free(req_cachep, ctx->fallback_req);
2b188cc1
JA
7129 kfree(ctx);
7130}
7131
7132static __poll_t io_uring_poll(struct file *file, poll_table *wait)
7133{
7134 struct io_ring_ctx *ctx = file->private_data;
7135 __poll_t mask = 0;
7136
7137 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
7138 /*
7139 * synchronizes with barrier from wq_has_sleeper call in
7140 * io_commit_cqring
7141 */
2b188cc1 7142 smp_rmb();
75b28aff
HV
7143 if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
7144 ctx->rings->sq_ring_entries)
2b188cc1 7145 mask |= EPOLLOUT | EPOLLWRNORM;
63e5d81f 7146 if (io_cqring_events(ctx, false))
2b188cc1
JA
7147 mask |= EPOLLIN | EPOLLRDNORM;
7148
7149 return mask;
7150}
7151
7152static int io_uring_fasync(int fd, struct file *file, int on)
7153{
7154 struct io_ring_ctx *ctx = file->private_data;
7155
7156 return fasync_helper(fd, file, on, &ctx->cq_fasync);
7157}
7158
071698e1
JA
7159static int io_remove_personalities(int id, void *p, void *data)
7160{
7161 struct io_ring_ctx *ctx = data;
7162 const struct cred *cred;
7163
7164 cred = idr_remove(&ctx->personality_idr, id);
7165 if (cred)
7166 put_cred(cred);
7167 return 0;
7168}
7169
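
/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * lifecycle of the personality ids stored in personality_idr. Registration
 * snapshots the calling task's credentials; a request can then opt into
 * those credentials by setting sqe->personality to the returned id.
 */
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Snapshot current credentials; returns a personality id, or -1 with errno. */
static int register_personality(int ring_fd)
{
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}

static void drop_personality(int ring_fd, int id)
{
	syscall(__NR_io_uring_register, ring_fd,
		IORING_UNREGISTER_PERSONALITY, NULL, id);
}
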
2b188cc1
JA
7170static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
7171{
7172 mutex_lock(&ctx->uring_lock);
7173 percpu_ref_kill(&ctx->refs);
7174 mutex_unlock(&ctx->uring_lock);
7175
df069d80
JA
7176 /*
7177 * Wait for sq thread to idle, if we have one. It won't spin on new
7178 * work after we've killed the ctx ref above. This is important to do
7179 * before we cancel existing commands, as the thread could otherwise
7180 * be queueing new work post that. If that's work we need to cancel,
7181 * it could cause shutdown to hang.
7182 */
7183 while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
7184 cpu_relax();
7185
5262f567 7186 io_kill_timeouts(ctx);
221c5eb2 7187 io_poll_remove_all(ctx);
561fb04a
JA
7188
7189 if (ctx->io_wq)
7190 io_wq_cancel_all(ctx->io_wq);
7191
def596e9 7192 io_iopoll_reap_events(ctx);
15dff286
JA
7193 /* if we failed setting up the ctx, we might not have any rings */
7194 if (ctx->rings)
7195 io_cqring_overflow_flush(ctx, true);
071698e1 7196 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
206aefde 7197 wait_for_completion(&ctx->completions[0]);
2b188cc1
JA
7198 io_ring_ctx_free(ctx);
7199}
7200
7201static int io_uring_release(struct inode *inode, struct file *file)
7202{
7203 struct io_ring_ctx *ctx = file->private_data;
7204
7205 file->private_data = NULL;
7206 io_ring_ctx_wait_and_kill(ctx);
7207 return 0;
7208}
7209
fcb323cc
JA
7210static void io_uring_cancel_files(struct io_ring_ctx *ctx,
7211 struct files_struct *files)
7212{
7213 struct io_kiocb *req;
7214 DEFINE_WAIT(wait);
7215
7216 while (!list_empty_careful(&ctx->inflight_list)) {
768134d4 7217 struct io_kiocb *cancel_req = NULL;
fcb323cc
JA
7218
7219 spin_lock_irq(&ctx->inflight_lock);
7220 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
768134d4
JA
7221 if (req->work.files != files)
7222 continue;
7223 /* req is being completed, ignore */
7224 if (!refcount_inc_not_zero(&req->refs))
7225 continue;
7226 cancel_req = req;
7227 break;
fcb323cc 7228 }
768134d4 7229 if (cancel_req)
fcb323cc 7230 prepare_to_wait(&ctx->inflight_wait, &wait,
768134d4 7231 TASK_UNINTERRUPTIBLE);
fcb323cc
JA
7232 spin_unlock_irq(&ctx->inflight_lock);
7233
768134d4
JA
7234 /* We need to keep going until we don't find a matching req */
7235 if (!cancel_req)
fcb323cc 7236 break;
2f6d9b9d 7237
2ca10259
JA
7238 if (cancel_req->flags & REQ_F_OVERFLOW) {
7239 spin_lock_irq(&ctx->completion_lock);
7240 list_del(&cancel_req->list);
7241 cancel_req->flags &= ~REQ_F_OVERFLOW;
7242 if (list_empty(&ctx->cq_overflow_list)) {
7243 clear_bit(0, &ctx->sq_check_overflow);
7244 clear_bit(0, &ctx->cq_check_overflow);
7245 }
7246 spin_unlock_irq(&ctx->completion_lock);
7247
7248 WRITE_ONCE(ctx->rings->cq_overflow,
7249 atomic_inc_return(&ctx->cached_cq_overflow));
7250
7251 /*
7252 * Put inflight ref and overflow ref. If that's
7253 * all we had, then we're done with this request.
7254 */
7255 if (refcount_sub_and_test(2, &cancel_req->refs)) {
7256 io_put_req(cancel_req);
7257 continue;
7258 }
7259 }
7260
2f6d9b9d
BL
7261 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
7262 io_put_req(cancel_req);
fcb323cc
JA
7263 schedule();
7264 }
768134d4 7265 finish_wait(&ctx->inflight_wait, &wait);
fcb323cc
JA
7266}
7267
7268static int io_uring_flush(struct file *file, void *data)
7269{
7270 struct io_ring_ctx *ctx = file->private_data;
7271
7272 io_uring_cancel_files(ctx, data);
6ab23144
JA
7273
7274 /*
7275 * If the task is going away, cancel work it may have pending
7276 */
7277 if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7278 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
7279
fcb323cc
JA
7280 return 0;
7281}
7282
6c5c240e
RP
7283static void *io_uring_validate_mmap_request(struct file *file,
7284 loff_t pgoff, size_t sz)
2b188cc1 7285{
2b188cc1 7286 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 7287 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
7288 struct page *page;
7289 void *ptr;
7290
7291 switch (offset) {
7292 case IORING_OFF_SQ_RING:
75b28aff
HV
7293 case IORING_OFF_CQ_RING:
7294 ptr = ctx->rings;
2b188cc1
JA
7295 break;
7296 case IORING_OFF_SQES:
7297 ptr = ctx->sq_sqes;
7298 break;
2b188cc1 7299 default:
6c5c240e 7300 return ERR_PTR(-EINVAL);
2b188cc1
JA
7301 }
7302
7303 page = virt_to_head_page(ptr);
a50b854e 7304 if (sz > page_size(page))
6c5c240e
RP
7305 return ERR_PTR(-EINVAL);
7306
7307 return ptr;
7308}
7309
7310#ifdef CONFIG_MMU
7311
7312static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7313{
7314 size_t sz = vma->vm_end - vma->vm_start;
7315 unsigned long pfn;
7316 void *ptr;
7317
7318 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7319 if (IS_ERR(ptr))
7320 return PTR_ERR(ptr);
2b188cc1
JA
7321
7322 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7323 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7324}
7325
6c5c240e
RP
7326#else /* !CONFIG_MMU */
7327
7328static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7329{
7330 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7331}
7332
7333static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7334{
7335 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7336}
7337
7338static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
7339 unsigned long addr, unsigned long len,
7340 unsigned long pgoff, unsigned long flags)
7341{
7342 void *ptr;
7343
7344 ptr = io_uring_validate_mmap_request(file, pgoff, len);
7345 if (IS_ERR(ptr))
7346 return PTR_ERR(ptr);
7347
7348 return (unsigned long) ptr;
7349}
7350
7351#endif /* !CONFIG_MMU */
7352
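
/*
 * Illustrative userspace sketch (not part of this kernel source): the three
 * mmap() calls that end up in io_uring_validate_mmap_request() above. The
 * sizes are computed from the offsets returned by io_uring_setup(); with
 * IORING_FEAT_SINGLE_MMAP the SQ and CQ rings share one mapping, but mapping
 * them separately as below also works since both offsets resolve to the
 * same rings allocation here.
 */
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int map_rings(int ring_fd, const struct io_uring_params *p,
		     void **sq_ring, void **cq_ring, void **sqes)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	size_t cq_sz = p->cq_off.cqes +
		       p->cq_entries * sizeof(struct io_uring_cqe);

	*sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd,
			IORING_OFF_SQ_RING);
	*cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ring_fd,
			IORING_OFF_CQ_RING);
	*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		     ring_fd, IORING_OFF_SQES);

	return (*sq_ring == MAP_FAILED || *cq_ring == MAP_FAILED ||
		*sqes == MAP_FAILED) ? -1 : 0;
}
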
2b188cc1
JA
7353SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7354 u32, min_complete, u32, flags, const sigset_t __user *, sig,
7355 size_t, sigsz)
7356{
7357 struct io_ring_ctx *ctx;
7358 long ret = -EBADF;
7359 int submitted = 0;
7360 struct fd f;
7361
b41e9852
JA
7362 if (current->task_works)
7363 task_work_run();
7364
6c271ce2 7365 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
2b188cc1
JA
7366 return -EINVAL;
7367
7368 f = fdget(fd);
7369 if (!f.file)
7370 return -EBADF;
7371
7372 ret = -EOPNOTSUPP;
7373 if (f.file->f_op != &io_uring_fops)
7374 goto out_fput;
7375
7376 ret = -ENXIO;
7377 ctx = f.file->private_data;
7378 if (!percpu_ref_tryget(&ctx->refs))
7379 goto out_fput;
7380
6c271ce2
JA
7381 /*
7382 * For SQ polling, the thread will do all submissions and completions.
7383 * Just return the requested submit count, and wake the thread if
7384 * we were asked to.
7385 */
b2a9eada 7386 ret = 0;
6c271ce2 7387 if (ctx->flags & IORING_SETUP_SQPOLL) {
c1edbf5f
JA
7388 if (!list_empty_careful(&ctx->cq_overflow_list))
7389 io_cqring_overflow_flush(ctx, false);
6c271ce2
JA
7390 if (flags & IORING_ENTER_SQ_WAKEUP)
7391 wake_up(&ctx->sqo_wait);
7392 submitted = to_submit;
b2a9eada 7393 } else if (to_submit) {
ae9428ca 7394 struct mm_struct *cur_mm;
2b188cc1
JA
7395
7396 mutex_lock(&ctx->uring_lock);
ae9428ca
PB
7397 /* already have mm, so io_submit_sqes() won't try to grab it */
7398 cur_mm = ctx->sqo_mm;
7399 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
7400 &cur_mm, false);
2b188cc1 7401 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
7402
7403 if (submitted != to_submit)
7404 goto out;
2b188cc1
JA
7405 }
7406 if (flags & IORING_ENTER_GETEVENTS) {
def596e9
JA
7407 unsigned nr_events = 0;
7408
2b188cc1
JA
7409 min_complete = min(min_complete, ctx->cq_entries);
7410
def596e9 7411 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9 7412 ret = io_iopoll_check(ctx, &nr_events, min_complete);
def596e9
JA
7413 } else {
7414 ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7415 }
2b188cc1
JA
7416 }
7417
7c504e65 7418out:
6805b32e 7419 percpu_ref_put(&ctx->refs);
2b188cc1
JA
7420out_fput:
7421 fdput(f);
7422 return submitted ? submitted : ret;
7423}
7424
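
/*
 * Illustrative userspace sketch (not part of this kernel source): the raw
 * calling convention of the syscall defined above. The sig/sigsz pair lets
 * the wait inside io_cqring_wait() run under a temporary signal mask that
 * the kernel restores afterwards, analogous to ppoll()/pselect(). Passing
 * _NSIG / 8 as sigsz matches the kernel sigset size on Linux.
 */
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int enter_submit_and_wait(int ring_fd, unsigned to_submit,
				 unsigned min_complete, const sigset_t *mask)
{
	return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
		       IORING_ENTER_GETEVENTS, mask, _NSIG / 8);
}
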
bebdb65e 7425#ifdef CONFIG_PROC_FS
87ce955b
JA
7426static int io_uring_show_cred(int id, void *p, void *data)
7427{
7428 const struct cred *cred = p;
7429 struct seq_file *m = data;
7430 struct user_namespace *uns = seq_user_ns(m);
7431 struct group_info *gi;
7432 kernel_cap_t cap;
7433 unsigned __capi;
7434 int g;
7435
7436 seq_printf(m, "%5d\n", id);
7437 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7438 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7439 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7440 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7441 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7442 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7443 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7444 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7445 seq_puts(m, "\n\tGroups:\t");
7446 gi = cred->group_info;
7447 for (g = 0; g < gi->ngroups; g++) {
7448 seq_put_decimal_ull(m, g ? " " : "",
7449 from_kgid_munged(uns, gi->gid[g]));
7450 }
7451 seq_puts(m, "\n\tCapEff:\t");
7452 cap = cred->cap_effective;
7453 CAP_FOR_EACH_U32(__capi)
7454 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7455 seq_putc(m, '\n');
7456 return 0;
7457}
7458
7459static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7460{
7461 int i;
7462
7463 mutex_lock(&ctx->uring_lock);
7464 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7465 for (i = 0; i < ctx->nr_user_files; i++) {
7466 struct fixed_file_table *table;
7467 struct file *f;
7468
7469 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7470 f = table->files[i & IORING_FILE_TABLE_MASK];
7471 if (f)
7472 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7473 else
7474 seq_printf(m, "%5u: <none>\n", i);
7475 }
7476 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7477 for (i = 0; i < ctx->nr_user_bufs; i++) {
7478 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7479
7480 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7481 (unsigned int) buf->len);
7482 }
7483 if (!idr_is_empty(&ctx->personality_idr)) {
7484 seq_printf(m, "Personalities:\n");
7485 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7486 }
d7718a9d
JA
7487 seq_printf(m, "PollList:\n");
7488 spin_lock_irq(&ctx->completion_lock);
7489 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7490 struct hlist_head *list = &ctx->cancel_hash[i];
7491 struct io_kiocb *req;
7492
7493 hlist_for_each_entry(req, list, hash_node)
7494 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7495 req->task->task_works != NULL);
7496 }
7497 spin_unlock_irq(&ctx->completion_lock);
87ce955b
JA
7498 mutex_unlock(&ctx->uring_lock);
7499}
7500
7501static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7502{
7503 struct io_ring_ctx *ctx = f->private_data;
7504
7505 if (percpu_ref_tryget(&ctx->refs)) {
7506 __io_uring_show_fdinfo(ctx, m);
7507 percpu_ref_put(&ctx->refs);
7508 }
7509}
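/*
 * Everything printed by the helpers above ends up in
 * /proc/<pid>/fdinfo/<ring fd>, so reading that file is the quickest way to
 * inspect registered files, buffers, personalities and the poll list.
 * Minimal userspace sketch (ring_fd is a placeholder; error handling
 * omitted):
 *
 *	char path[64], buf[4096];
 *	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring_fd);
 *	int fd = open(path, O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf) - 1);
 *	if (n > 0) {
 *		buf[n] = '\0';
 *		fputs(buf, stdout);
 *	}
 *	close(fd);
 */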
bebdb65e 7510#endif
87ce955b 7511
2b188cc1
JA
7512static const struct file_operations io_uring_fops = {
7513 .release = io_uring_release,
fcb323cc 7514 .flush = io_uring_flush,
2b188cc1 7515 .mmap = io_uring_mmap,
6c5c240e
RP
7516#ifndef CONFIG_MMU
7517 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7518 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7519#endif
2b188cc1
JA
7520 .poll = io_uring_poll,
7521 .fasync = io_uring_fasync,
bebdb65e 7522#ifdef CONFIG_PROC_FS
87ce955b 7523 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 7524#endif
2b188cc1
JA
7525};
7526
7527static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7528 struct io_uring_params *p)
7529{
75b28aff
HV
7530 struct io_rings *rings;
7531 size_t size, sq_array_offset;
2b188cc1 7532
75b28aff
HV
7533 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7534 if (size == SIZE_MAX)
7535 return -EOVERFLOW;
7536
7537 rings = io_mem_alloc(size);
7538 if (!rings)
2b188cc1
JA
7539 return -ENOMEM;
7540
75b28aff
HV
7541 ctx->rings = rings;
7542 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7543 rings->sq_ring_mask = p->sq_entries - 1;
7544 rings->cq_ring_mask = p->cq_entries - 1;
7545 rings->sq_ring_entries = p->sq_entries;
7546 rings->cq_ring_entries = p->cq_entries;
7547 ctx->sq_mask = rings->sq_ring_mask;
7548 ctx->cq_mask = rings->cq_ring_mask;
7549 ctx->sq_entries = rings->sq_ring_entries;
7550 ctx->cq_entries = rings->cq_ring_entries;
2b188cc1
JA
7551
7552 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
7553 if (size == SIZE_MAX) {
7554 io_mem_free(ctx->rings);
7555 ctx->rings = NULL;
2b188cc1 7556 return -EOVERFLOW;
eb065d30 7557 }
2b188cc1
JA
7558
7559 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
7560 if (!ctx->sq_sqes) {
7561 io_mem_free(ctx->rings);
7562 ctx->rings = NULL;
2b188cc1 7563 return -ENOMEM;
eb065d30 7564 }
2b188cc1 7565
2b188cc1
JA
7566 return 0;
7567}
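/*
 * The ring masks published above are simply entries - 1, so a free-running
 * head/tail counter becomes an array index with a single AND, no modulo.
 * Hedged userspace sketch of reaping one completion, assuming cq_head,
 * cq_tail, cq_ring_mask and cqes point into the mmap'ed CQ ring and
 * handle() is an application-defined placeholder:
 *
 *	unsigned head = __atomic_load_n(cq_head, __ATOMIC_ACQUIRE);
 *	unsigned tail = __atomic_load_n(cq_tail, __ATOMIC_ACQUIRE);
 *	if (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		handle(cqe->user_data, cqe->res);
 *		__atomic_store_n(cq_head, head + 1, __ATOMIC_RELEASE);
 *	}
 */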
7568
7569/*
7570 * Allocate an anonymous fd, which is what constitutes the application-
7571 * visible backing of an io_uring instance. The application mmaps this
7572 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7573 * we have to tie this fd to a socket for file garbage collection purposes.
7574 */
7575static int io_uring_get_fd(struct io_ring_ctx *ctx)
7576{
7577 struct file *file;
7578 int ret;
7579
7580#if defined(CONFIG_UNIX)
7581 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7582 &ctx->ring_sock);
7583 if (ret)
7584 return ret;
7585#endif
7586
7587 ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7588 if (ret < 0)
7589 goto err;
7590
7591 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7592 O_RDWR | O_CLOEXEC);
7593 if (IS_ERR(file)) {
7594 put_unused_fd(ret);
7595 ret = PTR_ERR(file);
7596 goto err;
7597 }
7598
7599#if defined(CONFIG_UNIX)
7600 ctx->ring_sock->file = file;
7601#endif
7602 fd_install(ret, file);
7603 return ret;
7604err:
7605#if defined(CONFIG_UNIX)
7606 sock_release(ctx->ring_sock);
7607 ctx->ring_sock = NULL;
7608#endif
7609 return ret;
7610}
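/*
 * From userspace, the fd installed above is only useful once the rings are
 * mapped. Hedged sketch using the sq_off/cq_off values reported in struct
 * io_uring_params (p) and the IORING_OFF_* constants from the uapi header;
 * the IORING_FEAT_SINGLE_MMAP shortcut is ignored for brevity and ring_fd
 * is a placeholder:
 *
 *	void *sq = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			ring_fd, IORING_OFF_SQ_RING);
 *	void *cq = mmap(NULL, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			ring_fd, IORING_OFF_CQ_RING);
 *	struct io_uring_sqe *sqes = mmap(NULL,
 *			p.sq_entries * sizeof(struct io_uring_sqe),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			ring_fd, IORING_OFF_SQES);
 */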
7611
7612static int io_uring_create(unsigned entries, struct io_uring_params *p)
7613{
7614 struct user_struct *user = NULL;
7615 struct io_ring_ctx *ctx;
7616 bool account_mem;
7617 int ret;
7618
8110c1a6 7619 if (!entries)
2b188cc1 7620 return -EINVAL;
8110c1a6
JA
7621 if (entries > IORING_MAX_ENTRIES) {
7622 if (!(p->flags & IORING_SETUP_CLAMP))
7623 return -EINVAL;
7624 entries = IORING_MAX_ENTRIES;
7625 }
2b188cc1
JA
7626
7627 /*
7628 * Use twice as many entries for the CQ ring. It's possible for the
7629 * application to drive a higher depth than the size of the SQ ring,
7630 * since the sqes are only used at submission time. This allows some
33a107f0
JA
7631 * flexibility to overcommit. If the application has
7632 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
7633 * of CQ ring entries manually.
2b188cc1
JA
7634 */
7635 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
7636 if (p->flags & IORING_SETUP_CQSIZE) {
7637 /*
7638 * If IORING_SETUP_CQSIZE is set, we do the same roundup
7639 * to a power-of-two, if it isn't already. We do NOT impose
7640 * any cq vs sq ring sizing.
7641 */
8110c1a6 7642 if (p->cq_entries < p->sq_entries)
33a107f0 7643 return -EINVAL;
8110c1a6
JA
7644 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7645 if (!(p->flags & IORING_SETUP_CLAMP))
7646 return -EINVAL;
7647 p->cq_entries = IORING_MAX_CQ_ENTRIES;
7648 }
33a107f0
JA
7649 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7650 } else {
7651 p->cq_entries = 2 * p->sq_entries;
7652 }
2b188cc1
JA
7653
7654 user = get_uid(current_user());
7655 account_mem = !capable(CAP_IPC_LOCK);
7656
7657 if (account_mem) {
7658 ret = io_account_mem(user,
7659 ring_pages(p->sq_entries, p->cq_entries));
7660 if (ret) {
7661 free_uid(user);
7662 return ret;
7663 }
7664 }
7665
7666 ctx = io_ring_ctx_alloc(p);
7667 if (!ctx) {
7668 if (account_mem)
7669 io_unaccount_mem(user, ring_pages(p->sq_entries,
7670 p->cq_entries));
7671 free_uid(user);
7672 return -ENOMEM;
7673 }
7674 ctx->compat = in_compat_syscall();
7675 ctx->account_mem = account_mem;
7676 ctx->user = user;
0b8c0ec7 7677 ctx->creds = get_current_cred();
2b188cc1
JA
7678
7679 ret = io_allocate_scq_urings(ctx, p);
7680 if (ret)
7681 goto err;
7682
6c271ce2 7683 ret = io_sq_offload_start(ctx, p);
2b188cc1
JA
7684 if (ret)
7685 goto err;
7686
2b188cc1 7687 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
7688 p->sq_off.head = offsetof(struct io_rings, sq.head);
7689 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7690 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7691 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7692 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7693 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7694 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
7695
7696 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
7697 p->cq_off.head = offsetof(struct io_rings, cq.head);
7698 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7699 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7700 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7701 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7702 p->cq_off.cqes = offsetof(struct io_rings, cqes);
ac90f249 7703
044c1ab3
JA
7704 /*
7705 * Install ring fd as the very last thing, so we don't risk someone
7706 * having closed it before we finish setup
7707 */
7708 ret = io_uring_get_fd(ctx);
7709 if (ret < 0)
7710 goto err;
7711
da8c9690 7712 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
cccf0ee8 7713 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
d7718a9d 7714 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
c826bd7a 7715 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
7716 return ret;
7717err:
7718 io_ring_ctx_wait_and_kill(ctx);
7719 return ret;
7720}
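/*
 * The sizing rules above are visible to applications through the params
 * they pass in: sq_entries is rounded up to a power of two, the CQ ring
 * defaults to twice that, and IORING_SETUP_CQSIZE/IORING_SETUP_CLAMP let
 * the caller pick or bound the CQ depth. Hedged setup sketch, assuming
 * __NR_io_uring_setup is available:
 *
 *	struct io_uring_params p = {
 *		.flags		= IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP,
 *		.cq_entries	= 4096,	// rounded up/clamped by the kernel
 *	};
 *	int ring_fd = syscall(__NR_io_uring_setup, 64, &p);
 *	// on success, p.sq_entries/p.cq_entries hold the actual ring sizes
 */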
7721
7722/*
7723 * Sets up an io_uring context and returns the fd. The application asks for a
7724 * ring size; we return the actual sq/cq ring sizes (among other things) in the
7725 * params structure passed in.
7726 */
7727static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
7728{
7729 struct io_uring_params p;
7730 long ret;
7731 int i;
7732
7733 if (copy_from_user(&p, params, sizeof(p)))
7734 return -EFAULT;
7735 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
7736 if (p.resv[i])
7737 return -EINVAL;
7738 }
7739
6c271ce2 7740 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 7741 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
24369c2e 7742 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
2b188cc1
JA
7743 return -EINVAL;
7744
7745 ret = io_uring_create(entries, &p);
7746 if (ret < 0)
7747 return ret;
7748
7749 if (copy_to_user(params, &p, sizeof(p)))
7750 return -EFAULT;
7751
7752 return ret;
7753}
7754
7755SYSCALL_DEFINE2(io_uring_setup, u32, entries,
7756 struct io_uring_params __user *, params)
7757{
7758 return io_uring_setup(entries, params);
7759}
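/*
 * The features mask filled in before returning lets applications check what
 * this kernel provides before relying on it. Illustrative sketch (error
 * handling omitted):
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 8, &p);
 *	int no_drop = ring_fd >= 0 && (p.features & IORING_FEAT_NODROP);
 *	// without IORING_FEAT_NODROP, CQ overflow can drop completions
 */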
7760
66f4af93
JA
7761static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
7762{
7763 struct io_uring_probe *p;
7764 size_t size;
7765 int i, ret;
7766
7767 size = struct_size(p, ops, nr_args);
7768 if (size == SIZE_MAX)
7769 return -EOVERFLOW;
7770 p = kzalloc(size, GFP_KERNEL);
7771 if (!p)
7772 return -ENOMEM;
7773
7774 ret = -EFAULT;
7775 if (copy_from_user(p, arg, size))
7776 goto out;
7777 ret = -EINVAL;
7778 if (memchr_inv(p, 0, size))
7779 goto out;
7780
7781 p->last_op = IORING_OP_LAST - 1;
7782 if (nr_args > IORING_OP_LAST)
7783 nr_args = IORING_OP_LAST;
7784
7785 for (i = 0; i < nr_args; i++) {
7786 p->ops[i].op = i;
7787 if (!io_op_defs[i].not_supported)
7788 p->ops[i].flags = IO_URING_OP_SUPPORTED;
7789 }
7790 p->ops_len = i;
7791
7792 ret = 0;
7793 if (copy_to_user(arg, p, size))
7794 ret = -EFAULT;
7795out:
7796 kfree(p);
7797 return ret;
7798}
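/*
 * Sketch of the probe interface from the application side: ops_len entries
 * are filled in above, and an opcode is usable when IO_URING_OP_SUPPORTED
 * is set in its flags (ring_fd is a placeholder; error handling omitted):
 *
 *	struct io_uring_probe *p = calloc(1, sizeof(*p) +
 *			IORING_OP_LAST * sizeof(struct io_uring_probe_op));
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_PROBE, p, IORING_OP_LAST);
 *	int have_openat = !ret && p->ops_len > IORING_OP_OPENAT &&
 *			  (p->ops[IORING_OP_OPENAT].flags & IO_URING_OP_SUPPORTED);
 *	free(p);
 */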
7799
071698e1
JA
7800static int io_register_personality(struct io_ring_ctx *ctx)
7801{
7802 const struct cred *creds = get_current_cred();
7803 int id;
7804
7805 id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
7806 USHRT_MAX, GFP_KERNEL);
7807 if (id < 0)
7808 put_cred(creds);
7809 return id;
7810}
7811
7812static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
7813{
7814 const struct cred *old_creds;
7815
7816 old_creds = idr_remove(&ctx->personality_idr, id);
7817 if (old_creds) {
7818 put_cred(old_creds);
7819 return 0;
7820 }
7821
7822 return -EINVAL;
7823}
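/*
 * The id allocated above is what an application later stores in
 * sqe->personality so that a single request runs with the registered
 * credentials. Hedged round trip (ring_fd and sqe are placeholders):
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	// prepare an sqe as usual, then:
 *	sqe->personality = id;
 *	// when the credentials are no longer needed:
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */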
7824
7825static bool io_register_op_must_quiesce(int op)
7826{
7827 switch (op) {
7828 case IORING_UNREGISTER_FILES:
7829 case IORING_REGISTER_FILES_UPDATE:
7830 case IORING_REGISTER_PROBE:
7831 case IORING_REGISTER_PERSONALITY:
7832 case IORING_UNREGISTER_PERSONALITY:
7833 return false;
7834 default:
7835 return true;
7836 }
7837}
7838
edafccee
JA
7839static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
7840 void __user *arg, unsigned nr_args)
b19062a5
JA
7841 __releases(ctx->uring_lock)
7842 __acquires(ctx->uring_lock)
edafccee
JA
7843{
7844 int ret;
7845
35fa71a0
JA
7846 /*
7847 * We're inside the ring mutex; if the ref is already dying, then
7848 * someone else killed the ctx or is already going through
7849 * io_uring_register().
7850 */
7851 if (percpu_ref_is_dying(&ctx->refs))
7852 return -ENXIO;
7853
071698e1 7854 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7855 percpu_ref_kill(&ctx->refs);
b19062a5 7856
05f3fb3c
JA
7857 /*
7858 * Drop uring mutex before waiting for references to exit. If
7859 * another thread is currently inside io_uring_enter() it might
7860 * need to grab the uring_lock to make progress. If we hold it
7861 * here across the drain wait, then we can deadlock. It's safe
7862 * to drop the mutex here, since no new references will come in
7863 * after we've killed the percpu ref.
7864 */
7865 mutex_unlock(&ctx->uring_lock);
c150368b 7866 ret = wait_for_completion_interruptible(&ctx->completions[0]);
05f3fb3c 7867 mutex_lock(&ctx->uring_lock);
c150368b
JA
7868 if (ret) {
7869 percpu_ref_resurrect(&ctx->refs);
7870 ret = -EINTR;
7871 goto out;
7872 }
05f3fb3c 7873 }
edafccee
JA
7874
7875 switch (opcode) {
7876 case IORING_REGISTER_BUFFERS:
7877 ret = io_sqe_buffer_register(ctx, arg, nr_args);
7878 break;
7879 case IORING_UNREGISTER_BUFFERS:
7880 ret = -EINVAL;
7881 if (arg || nr_args)
7882 break;
7883 ret = io_sqe_buffer_unregister(ctx);
7884 break;
6b06314c
JA
7885 case IORING_REGISTER_FILES:
7886 ret = io_sqe_files_register(ctx, arg, nr_args);
7887 break;
7888 case IORING_UNREGISTER_FILES:
7889 ret = -EINVAL;
7890 if (arg || nr_args)
7891 break;
7892 ret = io_sqe_files_unregister(ctx);
7893 break;
c3a31e60
JA
7894 case IORING_REGISTER_FILES_UPDATE:
7895 ret = io_sqe_files_update(ctx, arg, nr_args);
7896 break;
9b402849 7897 case IORING_REGISTER_EVENTFD:
f2842ab5 7898 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
7899 ret = -EINVAL;
7900 if (nr_args != 1)
7901 break;
7902 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
7903 if (ret)
7904 break;
7905 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
7906 ctx->eventfd_async = 1;
7907 else
7908 ctx->eventfd_async = 0;
9b402849
JA
7909 break;
7910 case IORING_UNREGISTER_EVENTFD:
7911 ret = -EINVAL;
7912 if (arg || nr_args)
7913 break;
7914 ret = io_eventfd_unregister(ctx);
7915 break;
66f4af93
JA
7916 case IORING_REGISTER_PROBE:
7917 ret = -EINVAL;
7918 if (!arg || nr_args > 256)
7919 break;
7920 ret = io_probe(ctx, arg, nr_args);
7921 break;
071698e1
JA
7922 case IORING_REGISTER_PERSONALITY:
7923 ret = -EINVAL;
7924 if (arg || nr_args)
7925 break;
7926 ret = io_register_personality(ctx);
7927 break;
7928 case IORING_UNREGISTER_PERSONALITY:
7929 ret = -EINVAL;
7930 if (arg)
7931 break;
7932 ret = io_unregister_personality(ctx, nr_args);
7933 break;
edafccee
JA
7934 default:
7935 ret = -EINVAL;
7936 break;
7937 }
7938
071698e1 7939 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 7940 /* bring the ctx back to life */
05f3fb3c 7941 percpu_ref_reinit(&ctx->refs);
c150368b
JA
7942out:
7943 reinit_completion(&ctx->completions[0]);
05f3fb3c 7944 }
edafccee
JA
7945 return ret;
7946}
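/*
 * Two common callers of the switch above, sketched from userspace:
 * registering an eventfd for completion notification (arg points at the
 * eventfd, nr_args must be 1) and registering fixed buffers as an iovec
 * array (ring_fd, buf and len are placeholders):
 *
 *	int evfd = eventfd(0, EFD_CLOEXEC);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_EVENTFD, &evfd, 1);
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_BUFFERS, &iov, 1);
 */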
7947
7948SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
7949 void __user *, arg, unsigned int, nr_args)
7950{
7951 struct io_ring_ctx *ctx;
7952 long ret = -EBADF;
7953 struct fd f;
7954
7955 f = fdget(fd);
7956 if (!f.file)
7957 return -EBADF;
7958
7959 ret = -EOPNOTSUPP;
7960 if (f.file->f_op != &io_uring_fops)
7961 goto out_fput;
7962
7963 ctx = f.file->private_data;
7964
7965 mutex_lock(&ctx->uring_lock);
7966 ret = __io_uring_register(ctx, opcode, arg, nr_args);
7967 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
7968 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
7969 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
7970out_fput:
7971 fdput(f);
7972 return ret;
7973}
7974
2b188cc1
JA
7975static int __init io_uring_init(void)
7976{
d7f62e82
SM
7977#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
7978 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
7979 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
7980} while (0)
7981
7982#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
7983 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
7984 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
7985 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
7986 BUILD_BUG_SQE_ELEM(1, __u8, flags);
7987 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
7988 BUILD_BUG_SQE_ELEM(4, __s32, fd);
7989 BUILD_BUG_SQE_ELEM(8, __u64, off);
7990 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
7991 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 7992 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
7993 BUILD_BUG_SQE_ELEM(24, __u32, len);
7994 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
7995 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
7996 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
7997 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
7998 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
7999 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
8000 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
8001 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
8002 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
8003 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
8004 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
8005 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
8006 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 8007 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
8008 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
8009 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
8010 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 8011 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
d7f62e82 8012
d3656344 8013 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
84557871 8014 BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
2b188cc1
JA
8015 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8016 return 0;
8017};
8018__initcall(io_uring_init);
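/*
 * The BUILD_BUG_ON checks above pin the sqe ABI; userspace that hand-rolls
 * the structures can assert the same contract at compile time (C11 sketch,
 * mirroring two of the checks above):
 *
 *	#include <assert.h>
 *	#include <stddef.h>
 *	#include <linux/io_uring.h>
 *
 *	static_assert(sizeof(struct io_uring_sqe) == 64, "sqe ABI changed");
 *	static_assert(offsetof(struct io_uring_sqe, user_data) == 32,
 *		      "sqe ABI changed");
 */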