io_uring: remove extra io_ring_exit_work wake up
[linux-2.6-block.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
d068b506 14 * through a control-dependency in io_get_cqe (smp_store_release to
1e84b97b
SB
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
52de1fe1 47#include <net/compat.h>
2b188cc1
JA
48#include <linux/refcount.h>
49#include <linux/uio.h>
6b47ee6e 50#include <linux/bits.h>
2b188cc1
JA
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
2b188cc1
JA
58#include <linux/percpu.h>
59#include <linux/slab.h>
2b188cc1 60#include <linux/blkdev.h>
edafccee 61#include <linux/bvec.h>
2b188cc1
JA
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
6b06314c 65#include <net/scm.h>
2b188cc1
JA
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
edafccee
JA
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
aa4c3967 72#include <linux/highmem.h>
15b71abe
JA
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
4840e418 75#include <linux/fadvise.h>
3e4827b0 76#include <linux/eventpoll.h>
7d67af2c 77#include <linux/splice.h>
b41e9852 78#include <linux/task_work.h>
bcf5a063 79#include <linux/pagemap.h>
0f212204 80#include <linux/io_uring.h>
ef98eb04 81#include <linux/tracehook.h>
2b188cc1 82
c826bd7a
DD
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
2b188cc1
JA
86#include <uapi/linux/io_uring.h>
87
88#include "internal.h"
561fb04a 89#include "io-wq.h"
2b188cc1 90
5277deaa 91#define IORING_MAX_ENTRIES 32768
33a107f0 92#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
4ce8ad95 93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
65e19f54 94
187f08c1 95/* only define max */
042b0d85 96#define IORING_MAX_FIXED_FILES (1U << 15)
21b55dbc
SG
97#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
98 IORING_REGISTER_LAST + IORING_OP_LAST)
2b188cc1 99
187f08c1 100#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
2d091d62
PB
101#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
102#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
103
489809e2
PB
104#define IORING_MAX_REG_BUFFERS (1U << 14)
105
68fe256a
PB
106#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
107 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
108
109#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
110
c854357b 111#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
d886e185
PB
112 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
113 REQ_F_ASYNC_DATA)
b16fed66 114
09899b19
PB
115#define IO_TCTX_REFS_CACHE_NR (1U << 10)
116
2b188cc1
JA
117struct io_uring {
118 u32 head ____cacheline_aligned_in_smp;
119 u32 tail ____cacheline_aligned_in_smp;
120};
121
1e84b97b 122/*
75b28aff
HV
123 * This data is shared with the application through the mmap at offsets
124 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
125 *
126 * The offsets to the member fields are published through struct
127 * io_sqring_offsets when calling io_uring_setup.
128 */
75b28aff 129struct io_rings {
1e84b97b
SB
130 /*
131 * Head and tail offsets into the ring; the offsets need to be
132 * masked to get valid indices.
133 *
75b28aff
HV
134 * The kernel controls head of the sq ring and the tail of the cq ring,
135 * and the application controls tail of the sq ring and the head of the
136 * cq ring.
1e84b97b 137 */
75b28aff 138 struct io_uring sq, cq;
1e84b97b 139 /*
75b28aff 140 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
141 * ring_entries - 1)
142 */
75b28aff
HV
143 u32 sq_ring_mask, cq_ring_mask;
144 /* Ring sizes (constant, power of 2) */
145 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
146 /*
147 * Number of invalid entries dropped by the kernel due to
148 * invalid index stored in array
149 *
150 * Written by the kernel, shouldn't be modified by the
151 * application (i.e. get number of "new events" by comparing to
152 * cached value).
153 *
154 * After a new SQ head value was read by the application this
155 * counter includes all submissions that were dropped reaching
156 * the new SQ head (and possibly more).
157 */
75b28aff 158 u32 sq_dropped;
1e84b97b 159 /*
0d9b5b3a 160 * Runtime SQ flags
1e84b97b
SB
161 *
162 * Written by the kernel, shouldn't be modified by the
163 * application.
164 *
165 * The application needs a full memory barrier before checking
166 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
167 */
75b28aff 168 u32 sq_flags;
0d9b5b3a
SG
169 /*
170 * Runtime CQ flags
171 *
172 * Written by the application, shouldn't be modified by the
173 * kernel.
174 */
fe7e3257 175 u32 cq_flags;
1e84b97b
SB
176 /*
177 * Number of completion events lost because the queue was full;
178 * this should be avoided by the application by making sure
0b4295b5 179 * there are not more requests pending than there is space in
1e84b97b
SB
180 * the completion queue.
181 *
182 * Written by the kernel, shouldn't be modified by the
183 * application (i.e. get number of "new events" by comparing to
184 * cached value).
185 *
186 * As completion events come in out of order this counter is not
187 * ordered with any other data.
188 */
75b28aff 189 u32 cq_overflow;
1e84b97b
SB
190 /*
191 * Ring buffer of completion events.
192 *
193 * The kernel writes completion events fresh every time they are
194 * produced, so the application is allowed to modify pending
195 * entries.
196 */
75b28aff 197 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
198};
199
45d189c6
PB
200enum io_uring_cmd_flags {
201 IO_URING_F_NONBLOCK = 1,
889fca73 202 IO_URING_F_COMPLETE_DEFER = 2,
45d189c6
PB
203};
204
edafccee
JA
205struct io_mapped_ubuf {
206 u64 ubuf;
4751f53d 207 u64 ubuf_end;
edafccee 208 unsigned int nr_bvecs;
de293938 209 unsigned long acct_pages;
41edf1a5 210 struct bio_vec bvec[];
edafccee
JA
211};
212
50238531
BM
213struct io_ring_ctx;
214
6c2450ae
PB
215struct io_overflow_cqe {
216 struct io_uring_cqe cqe;
217 struct list_head list;
218};
219
a04b0ac0
PB
220struct io_fixed_file {
221 /* file * with additional FFS_* flags */
222 unsigned long file_ptr;
223};
224
269bbe5f
BM
225struct io_rsrc_put {
226 struct list_head list;
b60c8dce 227 u64 tag;
50238531
BM
228 union {
229 void *rsrc;
230 struct file *file;
bd54b6fe 231 struct io_mapped_ubuf *buf;
50238531 232 };
269bbe5f
BM
233};
234
aeca241b 235struct io_file_table {
042b0d85 236 struct io_fixed_file *files;
31b51510
JA
237};
238
b895c9a6 239struct io_rsrc_node {
05589553
XW
240 struct percpu_ref refs;
241 struct list_head node;
269bbe5f 242 struct list_head rsrc_list;
b895c9a6 243 struct io_rsrc_data *rsrc_data;
4a38aed2 244 struct llist_node llist;
e297822b 245 bool done;
05589553
XW
246};
247
40ae0ff7
PB
248typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
249
b895c9a6 250struct io_rsrc_data {
05f3fb3c
JA
251 struct io_ring_ctx *ctx;
252
2d091d62
PB
253 u64 **tags;
254 unsigned int nr;
40ae0ff7 255 rsrc_put_fn *do_put;
3e942498 256 atomic_t refs;
05f3fb3c 257 struct completion done;
8bad28d8 258 bool quiesce;
05f3fb3c
JA
259};
260
5a2e745d
JA
261struct io_buffer {
262 struct list_head list;
263 __u64 addr;
d1f82808 264 __u32 len;
5a2e745d
JA
265 __u16 bid;
266};
267
21b55dbc
SG
268struct io_restriction {
269 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
270 DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
271 u8 sqe_flags_allowed;
272 u8 sqe_flags_required;
7e84e1c7 273 bool registered;
21b55dbc
SG
274};
275
37d1e2e3
JA
276enum {
277 IO_SQ_THREAD_SHOULD_STOP = 0,
278 IO_SQ_THREAD_SHOULD_PARK,
279};
280
534ca6d6
JA
281struct io_sq_data {
282 refcount_t refs;
9e138a48 283 atomic_t park_pending;
09a6f4ef 284 struct mutex lock;
69fb2131
JA
285
286 /* ctx's that are using this sqd */
287 struct list_head ctx_list;
69fb2131 288
534ca6d6
JA
289 struct task_struct *thread;
290 struct wait_queue_head wait;
08369246
XW
291
292 unsigned sq_thread_idle;
37d1e2e3
JA
293 int sq_cpu;
294 pid_t task_pid;
5c2469e0 295 pid_t task_tgid;
37d1e2e3
JA
296
297 unsigned long state;
37d1e2e3 298 struct completion exited;
534ca6d6
JA
299};
300
6dd0be1e 301#define IO_COMPL_BATCH 32
6ff119a6 302#define IO_REQ_CACHE_SIZE 32
bf019da7 303#define IO_REQ_ALLOC_BATCH 8
258b29a9 304
a1ab7b35
PB
305struct io_submit_link {
306 struct io_kiocb *head;
307 struct io_kiocb *last;
308};
309
258b29a9 310struct io_submit_state {
5a158c6b
PB
311 /* inline/task_work completion list, under ->uring_lock */
312 struct io_wq_work_node free_list;
313 /* batch completion logic */
314 struct io_wq_work_list compl_reqs;
a1ab7b35 315 struct io_submit_link link;
258b29a9 316
258b29a9 317 bool plug_started;
4b628aeb 318 bool need_plug;
5a158c6b 319 struct blk_plug plug;
258b29a9
PB
320};
321
2b188cc1 322struct io_ring_ctx {
b52ecf8c 323 /* const or read-mostly hot data */
2b188cc1
JA
324 struct {
325 struct percpu_ref refs;
2b188cc1 326
b52ecf8c 327 struct io_rings *rings;
2b188cc1 328 unsigned int flags;
e1d85334 329 unsigned int compat: 1;
e1d85334
RD
330 unsigned int drain_next: 1;
331 unsigned int eventfd_async: 1;
21b55dbc 332 unsigned int restricted: 1;
f18ee4cf 333 unsigned int off_timeout_used: 1;
10c66904 334 unsigned int drain_active: 1;
b52ecf8c 335 } ____cacheline_aligned_in_smp;
2b188cc1 336
7f1129d2 337 /* submission data */
b52ecf8c 338 struct {
0499e582
PB
339 struct mutex uring_lock;
340
75b28aff
HV
341 /*
342 * Ring buffer of indices into array of io_uring_sqe, which is
343 * mmapped by the application using the IORING_OFF_SQES offset.
344 *
345 * This indirection could e.g. be used to assign fixed
346 * io_uring_sqe entries to operations and only submit them to
347 * the queue when needed.
348 *
349 * The kernel modifies neither the indices array nor the entries
350 * array.
351 */
352 u32 *sq_array;
c7af47cf 353 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
354 unsigned cached_sq_head;
355 unsigned sq_entries;
de0617e4 356 struct list_head defer_list;
7f1129d2
PB
357
358 /*
359 * Fixed resources fast path, should be accessed only under
360 * uring_lock, and updated through io_uring_register(2)
361 */
362 struct io_rsrc_node *rsrc_node;
363 struct io_file_table file_table;
364 unsigned nr_user_files;
365 unsigned nr_user_bufs;
366 struct io_mapped_ubuf **user_bufs;
367
368 struct io_submit_state submit_state;
5262f567 369 struct list_head timeout_list;
ef9dd637 370 struct list_head ltimeout_list;
1d7bb1d5 371 struct list_head cq_overflow_list;
7f1129d2
PB
372 struct xarray io_buffers;
373 struct xarray personalities;
374 u32 pers_next;
375 unsigned sq_thread_idle;
2b188cc1
JA
376 } ____cacheline_aligned_in_smp;
377
d0acdee2 378 /* IRQ completion list, under ->completion_lock */
c2b6c6bc 379 struct io_wq_work_list locked_free_list;
d0acdee2 380 unsigned int locked_free_nr;
3c1a2ead 381
7c30f36a 382 const struct cred *sq_creds; /* cred used for __io_sq_thread() */
534ca6d6
JA
383 struct io_sq_data *sq_data; /* if using sq thread polling */
384
90554200 385 struct wait_queue_head sqo_sq_wait;
69fb2131 386 struct list_head sqd_list;
75b28aff 387
5ed7a37d
PB
388 unsigned long check_cq_overflow;
389
206aefde
JA
390 struct {
391 unsigned cached_cq_tail;
392 unsigned cq_entries;
0499e582 393 struct eventfd_ctx *cq_ev_fd;
0499e582
PB
394 struct wait_queue_head cq_wait;
395 unsigned cq_extra;
396 atomic_t cq_timeouts;
0499e582 397 unsigned cq_last_tm_flush;
206aefde 398 } ____cacheline_aligned_in_smp;
2b188cc1 399
2b188cc1
JA
400 struct {
401 spinlock_t completion_lock;
e94f141b 402
89850fce
JA
403 spinlock_t timeout_lock;
404
def596e9 405 /*
540e32a0 406 * ->iopoll_list is protected by the ctx->uring_lock for
def596e9
JA
407 * io_uring instances that don't use IORING_SETUP_SQPOLL.
408 * For SQPOLL, only the single threaded io_sq_thread() will
409 * manipulate the list, hence no extra locking is needed there.
410 */
5eef4e87 411 struct io_wq_work_list iopoll_list;
78076bb6
JA
412 struct hlist_head *cancel_hash;
413 unsigned cancel_hash_bits;
915b3dde 414 bool poll_multi_queue;
2b188cc1 415 } ____cacheline_aligned_in_smp;
85faa7b8 416
21b55dbc 417 struct io_restriction restrictions;
3c1a2ead 418
b13a8918
PB
419 /* slow path rsrc auxilary data, used by update/register */
420 struct {
421 struct io_rsrc_node *rsrc_backup_node;
422 struct io_mapped_ubuf *dummy_ubuf;
423 struct io_rsrc_data *file_data;
424 struct io_rsrc_data *buf_data;
425
426 struct delayed_work rsrc_put_work;
427 struct llist_head rsrc_put_llist;
428 struct list_head rsrc_ref_list;
429 spinlock_t rsrc_ref_lock;
430 };
431
3c1a2ead 432 /* Keep this last, we don't need it for the fast path */
b986af7e
PB
433 struct {
434 #if defined(CONFIG_UNIX)
435 struct socket *ring_sock;
436 #endif
437 /* hashed buffered write serialization */
438 struct io_wq_hash *hash_map;
439
440 /* Only used for accounting purposes */
441 struct user_struct *user;
442 struct mm_struct *mm_account;
443
444 /* ctx exit and cancelation */
9011bf9a
PB
445 struct llist_head fallback_llist;
446 struct delayed_work fallback_work;
b986af7e
PB
447 struct work_struct exit_work;
448 struct list_head tctx_list;
449 struct completion ref_comp;
450 };
2b188cc1
JA
451};
452
53e043b2
SM
453struct io_uring_task {
454 /* submission side */
09899b19 455 int cached_refs;
53e043b2
SM
456 struct xarray xa;
457 struct wait_queue_head wait;
ee53fb2b
SM
458 const struct io_ring_ctx *last;
459 struct io_wq *io_wq;
53e043b2 460 struct percpu_counter inflight;
b303fe2e 461 atomic_t inflight_tracked;
53e043b2 462 atomic_t in_idle;
53e043b2
SM
463
464 spinlock_t task_lock;
465 struct io_wq_work_list task_list;
53e043b2 466 struct callback_head task_work;
6294f368 467 bool task_running;
53e043b2
SM
468};
469
09bb8394
JA
470/*
471 * First field must be the file pointer in all the
472 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
473 */
221c5eb2
JA
474struct io_poll_iocb {
475 struct file *file;
018043be 476 struct wait_queue_head *head;
221c5eb2 477 __poll_t events;
8c838788 478 bool done;
221c5eb2 479 bool canceled;
392edb45 480 struct wait_queue_entry wait;
221c5eb2
JA
481};
482
9d805892 483struct io_poll_update {
018043be 484 struct file *file;
9d805892
PB
485 u64 old_user_data;
486 u64 new_user_data;
487 __poll_t events;
b69de288
JA
488 bool update_events;
489 bool update_user_data;
018043be
PB
490};
491
b5dba59e
JA
492struct io_close {
493 struct file *file;
b5dba59e 494 int fd;
7df778be 495 u32 file_slot;
b5dba59e
JA
496};
497
ad8a48ac
JA
498struct io_timeout_data {
499 struct io_kiocb *req;
500 struct hrtimer timer;
501 struct timespec64 ts;
502 enum hrtimer_mode mode;
50c1df2b 503 u32 flags;
ad8a48ac
JA
504};
505
8ed8d3c3
JA
506struct io_accept {
507 struct file *file;
508 struct sockaddr __user *addr;
509 int __user *addr_len;
510 int flags;
aaa4db12 511 u32 file_slot;
09952e3e 512 unsigned long nofile;
8ed8d3c3
JA
513};
514
515struct io_sync {
516 struct file *file;
517 loff_t len;
518 loff_t off;
519 int flags;
d63d1b5e 520 int mode;
8ed8d3c3
JA
521};
522
fbf23849
JA
523struct io_cancel {
524 struct file *file;
525 u64 addr;
526};
527
b29472ee
JA
528struct io_timeout {
529 struct file *file;
bfe68a22
PB
530 u32 off;
531 u32 target_seq;
135fcde8 532 struct list_head list;
90cd7e42
PB
533 /* head of the link, used by linked timeouts only */
534 struct io_kiocb *head;
89b263f6
JA
535 /* for linked completions */
536 struct io_kiocb *prev;
b29472ee
JA
537};
538
0bdf7a2d
PB
539struct io_timeout_rem {
540 struct file *file;
541 u64 addr;
9c8e11b3
PB
542
543 /* timeout update */
544 struct timespec64 ts;
545 u32 flags;
f1042b6c 546 bool ltimeout;
0bdf7a2d
PB
547};
548
9adbd45d
JA
549struct io_rw {
550 /* NOTE: kiocb has the file as the first member, so don't do it here */
551 struct kiocb kiocb;
552 u64 addr;
553 u64 len;
554};
555
3fbb51c1
JA
556struct io_connect {
557 struct file *file;
558 struct sockaddr __user *addr;
559 int addr_len;
560};
561
e47293fd
JA
562struct io_sr_msg {
563 struct file *file;
fddaface 564 union {
4af3417a
PB
565 struct compat_msghdr __user *umsg_compat;
566 struct user_msghdr __user *umsg;
567 void __user *buf;
fddaface 568 };
e47293fd 569 int msg_flags;
bcda7baa 570 int bgid;
fddaface 571 size_t len;
e47293fd
JA
572};
573
15b71abe
JA
574struct io_open {
575 struct file *file;
576 int dfd;
b9445598 577 u32 file_slot;
15b71abe 578 struct filename *filename;
c12cedf2 579 struct open_how how;
4022e7af 580 unsigned long nofile;
15b71abe
JA
581};
582
269bbe5f 583struct io_rsrc_update {
05f3fb3c
JA
584 struct file *file;
585 u64 arg;
586 u32 nr_args;
587 u32 offset;
588};
589
4840e418
JA
590struct io_fadvise {
591 struct file *file;
592 u64 offset;
593 u32 len;
594 u32 advice;
595};
596
c1ca757b
JA
597struct io_madvise {
598 struct file *file;
599 u64 addr;
600 u32 len;
601 u32 advice;
602};
603
3e4827b0
JA
604struct io_epoll {
605 struct file *file;
606 int epfd;
607 int op;
608 int fd;
609 struct epoll_event event;
e47293fd
JA
610};
611
7d67af2c
PB
612struct io_splice {
613 struct file *file_out;
614 struct file *file_in;
615 loff_t off_out;
616 loff_t off_in;
617 u64 len;
618 unsigned int flags;
619};
620
ddf0322d
JA
621struct io_provide_buf {
622 struct file *file;
623 __u64 addr;
38134ada 624 __u32 len;
ddf0322d
JA
625 __u32 bgid;
626 __u16 nbufs;
627 __u16 bid;
628};
629
1d9e1288
BM
630struct io_statx {
631 struct file *file;
632 int dfd;
633 unsigned int mask;
634 unsigned int flags;
e62753e4 635 const char __user *filename;
1d9e1288
BM
636 struct statx __user *buffer;
637};
638
36f4fa68
JA
639struct io_shutdown {
640 struct file *file;
641 int how;
642};
643
80a261fd
JA
644struct io_rename {
645 struct file *file;
646 int old_dfd;
647 int new_dfd;
648 struct filename *oldpath;
649 struct filename *newpath;
650 int flags;
651};
652
14a1143b
JA
653struct io_unlink {
654 struct file *file;
655 int dfd;
656 int flags;
657 struct filename *filename;
658};
659
e34a02dc
DK
660struct io_mkdir {
661 struct file *file;
662 int dfd;
663 umode_t mode;
664 struct filename *filename;
665};
666
7a8721f8
DK
667struct io_symlink {
668 struct file *file;
669 int new_dfd;
670 struct filename *oldpath;
671 struct filename *newpath;
672};
673
cf30da90
DK
674struct io_hardlink {
675 struct file *file;
676 int old_dfd;
677 int new_dfd;
678 struct filename *oldpath;
679 struct filename *newpath;
680 int flags;
681};
682
f499a021
JA
683struct io_async_connect {
684 struct sockaddr_storage address;
685};
686
03b1230c
JA
687struct io_async_msghdr {
688 struct iovec fast_iov[UIO_FASTIOV];
257e84a5
PB
689 /* points to an allocated iov, if NULL we use fast_iov instead */
690 struct iovec *free_iov;
03b1230c
JA
691 struct sockaddr __user *uaddr;
692 struct msghdr msg;
b537916c 693 struct sockaddr_storage addr;
03b1230c
JA
694};
695
f67676d1
JA
696struct io_async_rw {
697 struct iovec fast_iov[UIO_FASTIOV];
ff6165b2
JA
698 const struct iovec *free_iovec;
699 struct iov_iter iter;
cd658695 700 struct iov_iter_state iter_state;
227c0c96 701 size_t bytes_done;
bcf5a063 702 struct wait_page_queue wpq;
f67676d1
JA
703};
704
6b47ee6e
PB
705enum {
706 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
707 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
708 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
709 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
710 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 711 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
6b47ee6e 712
dddca226 713 /* first byte is taken by user flags, shift it to not overlap */
93d2bcd2 714 REQ_F_FAIL_BIT = 8,
6b47ee6e
PB
715 REQ_F_INFLIGHT_BIT,
716 REQ_F_CUR_POS_BIT,
717 REQ_F_NOWAIT_BIT,
6b47ee6e 718 REQ_F_LINK_TIMEOUT_BIT,
99bc4c38 719 REQ_F_NEED_CLEANUP_BIT,
d7718a9d 720 REQ_F_POLLED_BIT,
bcda7baa 721 REQ_F_BUFFER_SELECTED_BIT,
e342c807 722 REQ_F_COMPLETE_INLINE_BIT,
230d50d4 723 REQ_F_REISSUE_BIT,
b8e64b53 724 REQ_F_CREDS_BIT,
20e60a38 725 REQ_F_REFCOUNT_BIT,
4d13d1a4 726 REQ_F_ARM_LTIMEOUT_BIT,
d886e185 727 REQ_F_ASYNC_DATA_BIT,
7b29f92d 728 /* keep async read/write and isreg together and in order */
b191e2df
PB
729 REQ_F_NOWAIT_READ_BIT,
730 REQ_F_NOWAIT_WRITE_BIT,
7b29f92d 731 REQ_F_ISREG_BIT,
84557871
JA
732
733 /* not a real bit, just to check we're not overflowing the space */
734 __REQ_F_LAST_BIT,
6b47ee6e
PB
735};
736
737enum {
738 /* ctx owns file */
739 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
740 /* drain existing IO first */
741 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
742 /* linked sqes */
743 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
744 /* doesn't sever on completion < 0 */
745 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
746 /* IOSQE_ASYNC */
747 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
748 /* IOSQE_BUFFER_SELECT */
749 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
6b47ee6e 750
6b47ee6e 751 /* fail rest of links */
93d2bcd2 752 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
b05a1bcd 753 /* on inflight list, should be cancelled and waited on exit reliably */
6b47ee6e
PB
754 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
755 /* read/write uses file position */
756 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
757 /* must not punt to workers */
758 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
900fad45 759 /* has or had linked timeout */
6b47ee6e 760 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
99bc4c38
PB
761 /* needs cleanup */
762 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
d7718a9d
JA
763 /* already went through poll handler */
764 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
765 /* buffer already selected */
766 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
e342c807
PB
767 /* completion is deferred through io_comp_state */
768 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
230d50d4
JA
769 /* caller should reissue async */
770 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
7b29f92d 771 /* supports async reads */
b191e2df 772 REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
7b29f92d 773 /* supports async writes */
b191e2df 774 REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
7b29f92d
JA
775 /* regular file */
776 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
b8e64b53
PB
777 /* has creds assigned */
778 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
20e60a38
PB
779 /* skip refcounting if not set */
780 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
4d13d1a4
PB
781 /* there is a linked timeout that has to be armed */
782 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
d886e185
PB
783 /* ->async_data allocated */
784 REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
d7718a9d
JA
785};
786
787struct async_poll {
788 struct io_poll_iocb poll;
807abcb0 789 struct io_poll_iocb *double_poll;
6b47ee6e
PB
790};
791
f237c30a 792typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
5b0a6acc 793
7cbf1722 794struct io_task_work {
5b0a6acc
PB
795 union {
796 struct io_wq_work_node node;
797 struct llist_node fallback_node;
798 };
799 io_req_tw_func_t func;
7cbf1722
JA
800};
801
992da01a
PB
802enum {
803 IORING_RSRC_FILE = 0,
804 IORING_RSRC_BUFFER = 1,
805};
806
09bb8394
JA
807/*
808 * NOTE! Each of the iocb union members has the file pointer
809 * as the first entry in their struct definition. So you can
810 * access the file pointer through any of the sub-structs,
811 * or directly as just 'ki_filp' in this struct.
812 */
2b188cc1 813struct io_kiocb {
221c5eb2 814 union {
09bb8394 815 struct file *file;
9adbd45d 816 struct io_rw rw;
221c5eb2 817 struct io_poll_iocb poll;
9d805892 818 struct io_poll_update poll_update;
8ed8d3c3
JA
819 struct io_accept accept;
820 struct io_sync sync;
fbf23849 821 struct io_cancel cancel;
b29472ee 822 struct io_timeout timeout;
0bdf7a2d 823 struct io_timeout_rem timeout_rem;
3fbb51c1 824 struct io_connect connect;
e47293fd 825 struct io_sr_msg sr_msg;
15b71abe 826 struct io_open open;
b5dba59e 827 struct io_close close;
269bbe5f 828 struct io_rsrc_update rsrc_update;
4840e418 829 struct io_fadvise fadvise;
c1ca757b 830 struct io_madvise madvise;
3e4827b0 831 struct io_epoll epoll;
7d67af2c 832 struct io_splice splice;
ddf0322d 833 struct io_provide_buf pbuf;
1d9e1288 834 struct io_statx statx;
36f4fa68 835 struct io_shutdown shutdown;
80a261fd 836 struct io_rename rename;
14a1143b 837 struct io_unlink unlink;
e34a02dc 838 struct io_mkdir mkdir;
7a8721f8 839 struct io_symlink symlink;
cf30da90 840 struct io_hardlink hardlink;
221c5eb2 841 };
2b188cc1 842
d625c6ee 843 u8 opcode;
65a6543d
XW
844 /* polled IO has completed */
845 u8 iopoll_completed;
4f4eeba8 846 u16 buf_index;
d17e56eb
PB
847 unsigned int flags;
848
849 u64 user_data;
9cf7c104 850 u32 result;
d17e56eb 851 u32 cflags;
4f4eeba8 852
010e8e6b 853 struct io_ring_ctx *ctx;
010e8e6b 854 struct task_struct *task;
d7718a9d 855
269bbe5f 856 struct percpu_ref *fixed_rsrc_refs;
d886e185
PB
857 /* store used ubuf, so we can prevent reloading */
858 struct io_mapped_ubuf *imu;
fcb323cc 859
7e3709d5 860 /* used by request caches, completion batching and iopoll */
ef05d9eb 861 struct io_wq_work_node comp_list;
d17e56eb 862 atomic_t refs;
7e3709d5 863 struct io_kiocb *link;
5b0a6acc 864 struct io_task_work io_task_work;
010e8e6b
PB
865 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
866 struct hlist_node hash_node;
7e3709d5 867 /* internal polling, see IORING_FEAT_FAST_POLL */
010e8e6b 868 struct async_poll *apoll;
d886e185
PB
869 /* opcode allocated if it needs to store data for async defer */
870 void *async_data;
ef05d9eb 871 struct io_wq_work work;
7e3709d5 872 /* custom credentials, valid IFF REQ_F_CREDS is set */
ef05d9eb 873 const struct cred *creds;
7e3709d5 874 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
30d51dd4 875 struct io_buffer *kbuf;
2b188cc1 876};
05589553 877
13bf43f5
PB
878struct io_tctx_node {
879 struct list_head ctx_node;
880 struct task_struct *task;
13bf43f5
PB
881 struct io_ring_ctx *ctx;
882};
883
27dc8338
PB
884struct io_defer_entry {
885 struct list_head list;
886 struct io_kiocb *req;
9cf7c104 887 u32 seq;
2b188cc1
JA
888};
889
d3656344 890struct io_op_def {
d3656344
JA
891 /* needs req->file assigned */
892 unsigned needs_file : 1;
6d63416d
PB
893 /* should block plug */
894 unsigned plug : 1;
d3656344
JA
895 /* hash wq insertion if file is a regular file */
896 unsigned hash_reg_file : 1;
897 /* unbound wq insertion if file is a non-regular file */
898 unsigned unbound_nonreg_file : 1;
8a72758c
JA
899 /* set if opcode supports polled "wait" */
900 unsigned pollin : 1;
901 unsigned pollout : 1;
bcda7baa
JA
902 /* op supports buffer selection */
903 unsigned buffer_select : 1;
26f0505a
PB
904 /* do prep async if is going to be punted */
905 unsigned needs_async_setup : 1;
6d63416d
PB
906 /* opcode is not supported by this kernel */
907 unsigned not_supported : 1;
e8c2bc1f
JA
908 /* size of async data needed, if any */
909 unsigned short async_size;
d3656344
JA
910};
911
0918682b 912static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
913 [IORING_OP_NOP] = {},
914 [IORING_OP_READV] = {
d3656344
JA
915 .needs_file = 1,
916 .unbound_nonreg_file = 1,
8a72758c 917 .pollin = 1,
4d954c25 918 .buffer_select = 1,
26f0505a 919 .needs_async_setup = 1,
27926b68 920 .plug = 1,
e8c2bc1f 921 .async_size = sizeof(struct io_async_rw),
d3656344 922 },
0463b6c5 923 [IORING_OP_WRITEV] = {
d3656344
JA
924 .needs_file = 1,
925 .hash_reg_file = 1,
926 .unbound_nonreg_file = 1,
8a72758c 927 .pollout = 1,
26f0505a 928 .needs_async_setup = 1,
27926b68 929 .plug = 1,
e8c2bc1f 930 .async_size = sizeof(struct io_async_rw),
d3656344 931 },
0463b6c5 932 [IORING_OP_FSYNC] = {
d3656344
JA
933 .needs_file = 1,
934 },
0463b6c5 935 [IORING_OP_READ_FIXED] = {
d3656344
JA
936 .needs_file = 1,
937 .unbound_nonreg_file = 1,
8a72758c 938 .pollin = 1,
27926b68 939 .plug = 1,
e8c2bc1f 940 .async_size = sizeof(struct io_async_rw),
d3656344 941 },
0463b6c5 942 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
943 .needs_file = 1,
944 .hash_reg_file = 1,
945 .unbound_nonreg_file = 1,
8a72758c 946 .pollout = 1,
27926b68 947 .plug = 1,
e8c2bc1f 948 .async_size = sizeof(struct io_async_rw),
d3656344 949 },
0463b6c5 950 [IORING_OP_POLL_ADD] = {
d3656344
JA
951 .needs_file = 1,
952 .unbound_nonreg_file = 1,
953 },
0463b6c5
PB
954 [IORING_OP_POLL_REMOVE] = {},
955 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344
JA
956 .needs_file = 1,
957 },
0463b6c5 958 [IORING_OP_SENDMSG] = {
d3656344
JA
959 .needs_file = 1,
960 .unbound_nonreg_file = 1,
8a72758c 961 .pollout = 1,
26f0505a 962 .needs_async_setup = 1,
e8c2bc1f 963 .async_size = sizeof(struct io_async_msghdr),
d3656344 964 },
0463b6c5 965 [IORING_OP_RECVMSG] = {
d3656344
JA
966 .needs_file = 1,
967 .unbound_nonreg_file = 1,
8a72758c 968 .pollin = 1,
52de1fe1 969 .buffer_select = 1,
26f0505a 970 .needs_async_setup = 1,
e8c2bc1f 971 .async_size = sizeof(struct io_async_msghdr),
d3656344 972 },
0463b6c5 973 [IORING_OP_TIMEOUT] = {
e8c2bc1f 974 .async_size = sizeof(struct io_timeout_data),
d3656344 975 },
9c8e11b3
PB
976 [IORING_OP_TIMEOUT_REMOVE] = {
977 /* used by timeout updates' prep() */
9c8e11b3 978 },
0463b6c5 979 [IORING_OP_ACCEPT] = {
d3656344
JA
980 .needs_file = 1,
981 .unbound_nonreg_file = 1,
8a72758c 982 .pollin = 1,
d3656344 983 },
0463b6c5
PB
984 [IORING_OP_ASYNC_CANCEL] = {},
985 [IORING_OP_LINK_TIMEOUT] = {
e8c2bc1f 986 .async_size = sizeof(struct io_timeout_data),
d3656344 987 },
0463b6c5 988 [IORING_OP_CONNECT] = {
d3656344
JA
989 .needs_file = 1,
990 .unbound_nonreg_file = 1,
8a72758c 991 .pollout = 1,
26f0505a 992 .needs_async_setup = 1,
e8c2bc1f 993 .async_size = sizeof(struct io_async_connect),
d3656344 994 },
0463b6c5 995 [IORING_OP_FALLOCATE] = {
d3656344 996 .needs_file = 1,
d3656344 997 },
44526bed
JA
998 [IORING_OP_OPENAT] = {},
999 [IORING_OP_CLOSE] = {},
1000 [IORING_OP_FILES_UPDATE] = {},
1001 [IORING_OP_STATX] = {},
0463b6c5 1002 [IORING_OP_READ] = {
3a6820f2
JA
1003 .needs_file = 1,
1004 .unbound_nonreg_file = 1,
8a72758c 1005 .pollin = 1,
bcda7baa 1006 .buffer_select = 1,
27926b68 1007 .plug = 1,
e8c2bc1f 1008 .async_size = sizeof(struct io_async_rw),
3a6820f2 1009 },
0463b6c5 1010 [IORING_OP_WRITE] = {
3a6820f2 1011 .needs_file = 1,
7b3188e7 1012 .hash_reg_file = 1,
3a6820f2 1013 .unbound_nonreg_file = 1,
8a72758c 1014 .pollout = 1,
27926b68 1015 .plug = 1,
e8c2bc1f 1016 .async_size = sizeof(struct io_async_rw),
3a6820f2 1017 },
0463b6c5 1018 [IORING_OP_FADVISE] = {
4840e418 1019 .needs_file = 1,
c1ca757b 1020 },
44526bed 1021 [IORING_OP_MADVISE] = {},
0463b6c5 1022 [IORING_OP_SEND] = {
fddaface
JA
1023 .needs_file = 1,
1024 .unbound_nonreg_file = 1,
8a72758c 1025 .pollout = 1,
fddaface 1026 },
0463b6c5 1027 [IORING_OP_RECV] = {
fddaface
JA
1028 .needs_file = 1,
1029 .unbound_nonreg_file = 1,
8a72758c 1030 .pollin = 1,
bcda7baa 1031 .buffer_select = 1,
fddaface 1032 },
0463b6c5 1033 [IORING_OP_OPENAT2] = {
cebdb986 1034 },
3e4827b0
JA
1035 [IORING_OP_EPOLL_CTL] = {
1036 .unbound_nonreg_file = 1,
3e4827b0 1037 },
7d67af2c
PB
1038 [IORING_OP_SPLICE] = {
1039 .needs_file = 1,
1040 .hash_reg_file = 1,
1041 .unbound_nonreg_file = 1,
ddf0322d
JA
1042 },
1043 [IORING_OP_PROVIDE_BUFFERS] = {},
067524e9 1044 [IORING_OP_REMOVE_BUFFERS] = {},
f2a8d5c7
PB
1045 [IORING_OP_TEE] = {
1046 .needs_file = 1,
1047 .hash_reg_file = 1,
1048 .unbound_nonreg_file = 1,
1049 },
36f4fa68
JA
1050 [IORING_OP_SHUTDOWN] = {
1051 .needs_file = 1,
1052 },
44526bed
JA
1053 [IORING_OP_RENAMEAT] = {},
1054 [IORING_OP_UNLINKAT] = {},
e34a02dc 1055 [IORING_OP_MKDIRAT] = {},
7a8721f8 1056 [IORING_OP_SYMLINKAT] = {},
cf30da90 1057 [IORING_OP_LINKAT] = {},
d3656344
JA
1058};
1059
0756a869
PB
1060/* requests with any of those set should undergo io_disarm_next() */
1061#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
1062
7a612350 1063static bool io_disarm_next(struct io_kiocb *req);
eef51daa 1064static void io_uring_del_tctx_node(unsigned long index);
9936c7c2
PB
1065static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1066 struct task_struct *task,
3dd0c97a 1067 bool cancel_all);
78cc687b 1068static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1ffc5422 1069
d4d19c19 1070static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1071 s32 res, u32 cflags);
ec9c02ad 1072static void io_put_req(struct io_kiocb *req);
91c2f697 1073static void io_put_req_deferred(struct io_kiocb *req);
c7dae4ba 1074static void io_dismantle_req(struct io_kiocb *req);
94ae5e77 1075static void io_queue_linked_timeout(struct io_kiocb *req);
fdecb662 1076static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 1077 struct io_uring_rsrc_update2 *up,
98f0b3b4 1078 unsigned nr_args);
68fb8979 1079static void io_clean_op(struct io_kiocb *req);
ac177053 1080static struct file *io_file_get(struct io_ring_ctx *ctx,
8371adf5 1081 struct io_kiocb *req, int fd, bool fixed);
c5eef2b9 1082static void __io_queue_sqe(struct io_kiocb *req);
269bbe5f 1083static void io_rsrc_put_work(struct work_struct *work);
de0617e4 1084
907d1df3 1085static void io_req_task_queue(struct io_kiocb *req);
c450178d 1086static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
179ae0d1 1087static int io_req_prep_async(struct io_kiocb *req);
de0617e4 1088
b9445598
PB
1089static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1090 unsigned int issue_flags, u32 slot_index);
7df778be
PB
1091static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
1092
f1042b6c 1093static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
b9445598 1094
2b188cc1
JA
1095static struct kmem_cache *req_cachep;
1096
0918682b 1097static const struct file_operations io_uring_fops;
2b188cc1
JA
1098
1099struct sock *io_uring_get_socket(struct file *file)
1100{
1101#if defined(CONFIG_UNIX)
1102 if (file->f_op == &io_uring_fops) {
1103 struct io_ring_ctx *ctx = file->private_data;
1104
1105 return ctx->ring_sock->sk;
1106 }
1107#endif
1108 return NULL;
1109}
1110EXPORT_SYMBOL(io_uring_get_socket);
1111
f237c30a
PB
1112static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1113{
1114 if (!*locked) {
1115 mutex_lock(&ctx->uring_lock);
1116 *locked = true;
1117 }
1118}
1119
f2f87370
PB
1120#define io_for_each_link(pos, head) \
1121 for (pos = (head); pos; pos = pos->link)
1122
21c843d5
PB
1123/*
1124 * Shamelessly stolen from the mm implementation of page reference checking,
1125 * see commit f958d7b528b1 for details.
1126 */
1127#define req_ref_zero_or_close_to_overflow(req) \
1128 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
1129
1130static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1131{
20e60a38 1132 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1133 return atomic_inc_not_zero(&req->refs);
1134}
1135
21c843d5
PB
1136static inline bool req_ref_put_and_test(struct io_kiocb *req)
1137{
20e60a38
PB
1138 if (likely(!(req->flags & REQ_F_REFCOUNT)))
1139 return true;
1140
21c843d5
PB
1141 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1142 return atomic_dec_and_test(&req->refs);
1143}
1144
1145static inline void req_ref_put(struct io_kiocb *req)
1146{
20e60a38 1147 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1148 WARN_ON_ONCE(req_ref_put_and_test(req));
1149}
1150
1151static inline void req_ref_get(struct io_kiocb *req)
1152{
20e60a38 1153 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1154 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1155 atomic_inc(&req->refs);
1156}
1157
c450178d
PB
1158static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1159{
6f33b0bc 1160 if (!wq_list_empty(&ctx->submit_state.compl_reqs))
c450178d
PB
1161 __io_submit_flush_completions(ctx);
1162}
1163
48dcd38d 1164static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
20e60a38
PB
1165{
1166 if (!(req->flags & REQ_F_REFCOUNT)) {
1167 req->flags |= REQ_F_REFCOUNT;
48dcd38d 1168 atomic_set(&req->refs, nr);
20e60a38
PB
1169 }
1170}
1171
48dcd38d
PB
1172static inline void io_req_set_refcount(struct io_kiocb *req)
1173{
1174 __io_req_set_refcount(req, 1);
1175}
1176
b895c9a6 1177static inline void io_req_set_rsrc_node(struct io_kiocb *req)
36f72fe2
PB
1178{
1179 struct io_ring_ctx *ctx = req->ctx;
1180
269bbe5f 1181 if (!req->fixed_rsrc_refs) {
a7f0ed5a 1182 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
269bbe5f 1183 percpu_ref_get(req->fixed_rsrc_refs);
36f72fe2
PB
1184 }
1185}
1186
f70865db
PB
1187static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1188{
1189 bool got = percpu_ref_tryget(ref);
1190
1191 /* already at zero, wait for ->release() */
1192 if (!got)
1193 wait_for_completion(compl);
1194 percpu_ref_resurrect(ref);
1195 if (got)
1196 percpu_ref_put(ref);
1197}
1198
3dd0c97a
PB
1199static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1200 bool cancel_all)
08d23634
PB
1201{
1202 struct io_kiocb *req;
1203
68207680 1204 if (task && head->task != task)
08d23634 1205 return false;
3dd0c97a 1206 if (cancel_all)
08d23634
PB
1207 return true;
1208
1209 io_for_each_link(req, head) {
b05a1bcd 1210 if (req->flags & REQ_F_INFLIGHT)
02a13674 1211 return true;
08d23634
PB
1212 }
1213 return false;
1214}
1215
d886e185
PB
1216static inline bool req_has_async_data(struct io_kiocb *req)
1217{
1218 return req->flags & REQ_F_ASYNC_DATA;
1219}
1220
93d2bcd2 1221static inline void req_set_fail(struct io_kiocb *req)
c40f6379 1222{
93d2bcd2 1223 req->flags |= REQ_F_FAIL;
c40f6379 1224}
4a38aed2 1225
a8295b98
HX
1226static inline void req_fail_link_node(struct io_kiocb *req, int res)
1227{
1228 req_set_fail(req);
1229 req->result = res;
1230}
1231
c072481d 1232static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
2b188cc1
JA
1233{
1234 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1235
0f158b4c 1236 complete(&ctx->ref_comp);
2b188cc1
JA
1237}
1238
8eb7e2d0
PB
1239static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1240{
1241 return !req->timeout.off;
1242}
1243
c072481d 1244static __cold void io_fallback_req_func(struct work_struct *work)
f56165e6
PB
1245{
1246 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1247 fallback_work.work);
1248 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1249 struct io_kiocb *req, *tmp;
f237c30a 1250 bool locked = false;
f56165e6
PB
1251
1252 percpu_ref_get(&ctx->refs);
1253 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
f237c30a 1254 req->io_task_work.func(req, &locked);
5636c00d 1255
f237c30a 1256 if (locked) {
c450178d 1257 io_submit_flush_completions(ctx);
f237c30a
PB
1258 mutex_unlock(&ctx->uring_lock);
1259 }
f56165e6
PB
1260 percpu_ref_put(&ctx->refs);
1261}
1262
c072481d 1263static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
2b188cc1
JA
1264{
1265 struct io_ring_ctx *ctx;
78076bb6 1266 int hash_bits;
2b188cc1
JA
1267
1268 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1269 if (!ctx)
1270 return NULL;
1271
78076bb6
JA
1272 /*
1273 * Use 5 bits less than the max cq entries, that should give us around
1274 * 32 entries per hash list if totally full and uniformly spread.
1275 */
1276 hash_bits = ilog2(p->cq_entries);
1277 hash_bits -= 5;
1278 if (hash_bits <= 0)
1279 hash_bits = 1;
1280 ctx->cancel_hash_bits = hash_bits;
1281 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1282 GFP_KERNEL);
1283 if (!ctx->cancel_hash)
1284 goto err;
1285 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1286
6224843d
PB
1287 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1288 if (!ctx->dummy_ubuf)
1289 goto err;
1290 /* set invalid range, so io_import_fixed() fails meeting it */
1291 ctx->dummy_ubuf->ubuf = -1UL;
1292
21482896 1293 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
1294 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1295 goto err;
2b188cc1
JA
1296
1297 ctx->flags = p->flags;
90554200 1298 init_waitqueue_head(&ctx->sqo_sq_wait);
69fb2131 1299 INIT_LIST_HEAD(&ctx->sqd_list);
1d7bb1d5 1300 INIT_LIST_HEAD(&ctx->cq_overflow_list);
0f158b4c 1301 init_completion(&ctx->ref_comp);
9e15c3a0 1302 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
61cf9370 1303 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
2b188cc1 1304 mutex_init(&ctx->uring_lock);
311997b3 1305 init_waitqueue_head(&ctx->cq_wait);
2b188cc1 1306 spin_lock_init(&ctx->completion_lock);
89850fce 1307 spin_lock_init(&ctx->timeout_lock);
5eef4e87 1308 INIT_WQ_LIST(&ctx->iopoll_list);
de0617e4 1309 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 1310 INIT_LIST_HEAD(&ctx->timeout_list);
ef9dd637 1311 INIT_LIST_HEAD(&ctx->ltimeout_list);
d67d2263
BM
1312 spin_lock_init(&ctx->rsrc_ref_lock);
1313 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
269bbe5f
BM
1314 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1315 init_llist_head(&ctx->rsrc_put_llist);
13bf43f5 1316 INIT_LIST_HEAD(&ctx->tctx_list);
c2b6c6bc
PB
1317 ctx->submit_state.free_list.next = NULL;
1318 INIT_WQ_LIST(&ctx->locked_free_list);
9011bf9a 1319 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
6f33b0bc 1320 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
2b188cc1 1321 return ctx;
206aefde 1322err:
6224843d 1323 kfree(ctx->dummy_ubuf);
78076bb6 1324 kfree(ctx->cancel_hash);
206aefde
JA
1325 kfree(ctx);
1326 return NULL;
2b188cc1
JA
1327}
1328
8f6ed49a
PB
1329static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1330{
1331 struct io_rings *r = ctx->rings;
1332
1333 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1334 ctx->cq_extra--;
1335}
1336
9cf7c104 1337static bool req_need_defer(struct io_kiocb *req, u32 seq)
7adf4eaf 1338{
2bc9930e
JA
1339 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1340 struct io_ring_ctx *ctx = req->ctx;
a197f664 1341
8f6ed49a 1342 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
2bc9930e 1343 }
de0617e4 1344
9d858b21 1345 return false;
de0617e4
JA
1346}
1347
c97d8a0f
PB
1348#define FFS_ASYNC_READ 0x1UL
1349#define FFS_ASYNC_WRITE 0x2UL
1350#ifdef CONFIG_64BIT
1351#define FFS_ISREG 0x4UL
1352#else
1353#define FFS_ISREG 0x0UL
1354#endif
1355#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
1356
1357static inline bool io_req_ffs_set(struct io_kiocb *req)
1358{
1359 return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1360}
1361
c072481d 1362static inline void io_req_track_inflight(struct io_kiocb *req)
ce3d5aae 1363{
ce3d5aae 1364 if (!(req->flags & REQ_F_INFLIGHT)) {
ce3d5aae 1365 req->flags |= REQ_F_INFLIGHT;
b303fe2e 1366 atomic_inc(&current->io_uring->inflight_tracked);
ce3d5aae
PB
1367 }
1368}
1369
906c6caa
PB
1370static inline void io_unprep_linked_timeout(struct io_kiocb *req)
1371{
1372 req->flags &= ~REQ_F_LINK_TIMEOUT;
1373}
1374
fd08e530
PB
1375static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1376{
906c6caa
PB
1377 if (WARN_ON_ONCE(!req->link))
1378 return NULL;
1379
4d13d1a4
PB
1380 req->flags &= ~REQ_F_ARM_LTIMEOUT;
1381 req->flags |= REQ_F_LINK_TIMEOUT;
fd08e530
PB
1382
1383 /* linked timeouts should have two refs once prep'ed */
48dcd38d 1384 io_req_set_refcount(req);
4d13d1a4
PB
1385 __io_req_set_refcount(req->link, 2);
1386 return req->link;
fd08e530
PB
1387}
1388
1389static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1390{
4d13d1a4 1391 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
fd08e530
PB
1392 return NULL;
1393 return __io_prep_linked_timeout(req);
1394}
1395
1e6fa521
JA
1396static void io_prep_async_work(struct io_kiocb *req)
1397{
1398 const struct io_op_def *def = &io_op_defs[req->opcode];
1e6fa521
JA
1399 struct io_ring_ctx *ctx = req->ctx;
1400
b8e64b53
PB
1401 if (!(req->flags & REQ_F_CREDS)) {
1402 req->flags |= REQ_F_CREDS;
c10d1f98 1403 req->creds = get_current_cred();
b8e64b53 1404 }
003e8dcc 1405
e1d675df
PB
1406 req->work.list.next = NULL;
1407 req->work.flags = 0;
feaadc4f
PB
1408 if (req->flags & REQ_F_FORCE_ASYNC)
1409 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1410
1e6fa521
JA
1411 if (req->flags & REQ_F_ISREG) {
1412 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1413 io_wq_hash_work(&req->work, file_inode(req->file));
4b982bd0 1414 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1e6fa521
JA
1415 if (def->unbound_nonreg_file)
1416 req->work.flags |= IO_WQ_WORK_UNBOUND;
1417 }
e1d675df
PB
1418
1419 switch (req->opcode) {
1420 case IORING_OP_SPLICE:
1421 case IORING_OP_TEE:
e1d675df
PB
1422 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1423 req->work.flags |= IO_WQ_WORK_UNBOUND;
1424 break;
1425 }
561fb04a 1426}
cccf0ee8 1427
cbdcb435 1428static void io_prep_async_link(struct io_kiocb *req)
561fb04a 1429{
cbdcb435 1430 struct io_kiocb *cur;
54a91f3b 1431
44eff40a
PB
1432 if (req->flags & REQ_F_LINK_TIMEOUT) {
1433 struct io_ring_ctx *ctx = req->ctx;
1434
79ebeaee 1435 spin_lock(&ctx->completion_lock);
44eff40a
PB
1436 io_for_each_link(cur, req)
1437 io_prep_async_work(cur);
79ebeaee 1438 spin_unlock(&ctx->completion_lock);
44eff40a
PB
1439 } else {
1440 io_for_each_link(cur, req)
1441 io_prep_async_work(cur);
1442 }
561fb04a
JA
1443}
1444
fff4e40e
PB
1445static inline void io_req_add_compl_list(struct io_kiocb *req)
1446{
1447 struct io_submit_state *state = &req->ctx->submit_state;
1448
1449 wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1450}
1451
f237c30a 1452static void io_queue_async_work(struct io_kiocb *req, bool *locked)
561fb04a 1453{
a197f664 1454 struct io_ring_ctx *ctx = req->ctx;
cbdcb435 1455 struct io_kiocb *link = io_prep_linked_timeout(req);
5aa75ed5 1456 struct io_uring_task *tctx = req->task->io_uring;
561fb04a 1457
f237c30a
PB
1458 /* must not take the lock, NULL it as a precaution */
1459 locked = NULL;
1460
3bfe6106
JA
1461 BUG_ON(!tctx);
1462 BUG_ON(!tctx->io_wq);
561fb04a 1463
cbdcb435
PB
1464 /* init ->work of the whole link before punting */
1465 io_prep_async_link(req);
991468dc
JA
1466
1467 /*
1468 * Not expected to happen, but if we do have a bug where this _can_
1469 * happen, catch it here and ensure the request is marked as
1470 * canceled. That will make io-wq go through the usual work cancel
1471 * procedure rather than attempt to run this request (or create a new
1472 * worker for it).
1473 */
1474 if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1475 req->work.flags |= IO_WQ_WORK_CANCEL;
1476
d07f1e8a
PB
1477 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1478 &req->work, req->flags);
ebf93667 1479 io_wq_enqueue(tctx->io_wq, &req->work);
7271ef3a
JA
1480 if (link)
1481 io_queue_linked_timeout(link);
cbdcb435
PB
1482}
1483
1ee4160c 1484static void io_kill_timeout(struct io_kiocb *req, int status)
8c855885 1485 __must_hold(&req->ctx->completion_lock)
89850fce 1486 __must_hold(&req->ctx->timeout_lock)
5262f567 1487{
e8c2bc1f 1488 struct io_timeout_data *io = req->async_data;
5262f567 1489
fd9c7bc5 1490 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2ae2eb9d
PB
1491 if (status)
1492 req_set_fail(req);
01cec8c1
PB
1493 atomic_set(&req->ctx->cq_timeouts,
1494 atomic_read(&req->ctx->cq_timeouts) + 1);
135fcde8 1495 list_del_init(&req->timeout.list);
d4d19c19 1496 io_cqring_fill_event(req->ctx, req->user_data, status, 0);
91c2f697 1497 io_put_req_deferred(req);
5262f567
JA
1498 }
1499}
1500
c072481d 1501static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
de0617e4 1502{
441b8a78 1503 while (!list_empty(&ctx->defer_list)) {
27dc8338
PB
1504 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1505 struct io_defer_entry, list);
de0617e4 1506
9cf7c104 1507 if (req_need_defer(de->req, de->seq))
04518945 1508 break;
27dc8338 1509 list_del_init(&de->list);
907d1df3 1510 io_req_task_queue(de->req);
27dc8338 1511 kfree(de);
441b8a78 1512 }
04518945
PB
1513}
1514
c072481d 1515static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
89850fce 1516 __must_hold(&ctx->completion_lock)
de0617e4 1517{
441b8a78 1518 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
f010505b 1519
79ebeaee 1520 spin_lock_irq(&ctx->timeout_lock);
f18ee4cf 1521 while (!list_empty(&ctx->timeout_list)) {
f010505b 1522 u32 events_needed, events_got;
360428f8 1523 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
135fcde8 1524 struct io_kiocb, timeout.list);
de0617e4 1525
8eb7e2d0 1526 if (io_is_timeout_noseq(req))
360428f8 1527 break;
f010505b
MDG
1528
1529 /*
1530 * Since seq can easily wrap around over time, subtract
1531 * the last seq at which timeouts were flushed before comparing.
1532 * Assuming not more than 2^31-1 events have happened since,
1533 * these subtractions won't have wrapped, so we can check if
1534 * target is in [last_seq, current_seq] by comparing the two.
1535 */
1536 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1537 events_got = seq - ctx->cq_last_tm_flush;
1538 if (events_got < events_needed)
360428f8 1539 break;
bfe68a22 1540
135fcde8 1541 list_del_init(&req->timeout.list);
1ee4160c 1542 io_kill_timeout(req, 0);
f18ee4cf 1543 }
f010505b 1544 ctx->cq_last_tm_flush = seq;
79ebeaee 1545 spin_unlock_irq(&ctx->timeout_lock);
360428f8 1546}
5262f567 1547
c072481d 1548static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
360428f8 1549{
2335f6f5
PB
1550 if (ctx->off_timeout_used)
1551 io_flush_timeouts(ctx);
1552 if (ctx->drain_active)
1553 io_queue_deferred(ctx);
1554}
1555
1556static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1557{
1558 if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1559 __io_commit_cqring_flush(ctx);
ec30e04b
PB
1560 /* order cqe stores with ring update */
1561 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
de0617e4
JA
1562}
1563
90554200
JA
1564static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1565{
1566 struct io_rings *r = ctx->rings;
1567
a566c556 1568 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
90554200
JA
1569}
1570
888aae2e
PB
1571static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1572{
1573 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1574}
1575
d068b506 1576static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
2b188cc1 1577{
75b28aff 1578 struct io_rings *rings = ctx->rings;
ea5ab3b5 1579 unsigned tail, mask = ctx->cq_entries - 1;
2b188cc1 1580
115e12e5
SB
1581 /*
1582 * writes to the cq entry need to come after reading head; the
1583 * control dependency is enough as we're using WRITE_ONCE to
1584 * fill the cq entry
1585 */
a566c556 1586 if (__io_cqring_events(ctx) == ctx->cq_entries)
2b188cc1
JA
1587 return NULL;
1588
888aae2e 1589 tail = ctx->cached_cq_tail++;
ea5ab3b5 1590 return &rings->cqes[tail & mask];
2b188cc1
JA
1591}
1592
f2842ab5
JA
1593static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1594{
44c769de 1595 if (likely(!ctx->cq_ev_fd))
f0b493e6 1596 return false;
7e55a19c
SG
1597 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1598 return false;
44c769de 1599 return !ctx->eventfd_async || io_wq_current_is_worker();
f2842ab5
JA
1600}
1601
2c5d763c
JA
1602/*
1603 * This should only get called when at least one event has been posted.
1604 * Some applications rely on the eventfd notification count only changing
1605 * IFF a new CQE has been added to the CQ ring. There's no depedency on
1606 * 1:1 relationship between how many times this function is called (and
1607 * hence the eventfd count) and number of CQEs posted to the CQ ring.
1608 */
b41e9852 1609static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 1610{
5fd46178
JA
1611 /*
1612 * wake_up_all() may seem excessive, but io_wake_function() and
1613 * io_should_wake() handle the termination of the loop and only
1614 * wake as many waiters as we need to.
1615 */
1616 if (wq_has_sleeper(&ctx->cq_wait))
1617 wake_up_all(&ctx->cq_wait);
b41e9852 1618 if (io_should_trigger_evfd(ctx))
1d7bb1d5
JA
1619 eventfd_signal(ctx->cq_ev_fd, 1);
1620}
1621
80c18e4a
PB
1622static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1623{
c57a91fb
PB
1624 /* see waitqueue_active() comment */
1625 smp_mb();
1626
80c18e4a 1627 if (ctx->flags & IORING_SETUP_SQPOLL) {
c57a91fb 1628 if (waitqueue_active(&ctx->cq_wait))
5fd46178 1629 wake_up_all(&ctx->cq_wait);
80c18e4a
PB
1630 }
1631 if (io_should_trigger_evfd(ctx))
1632 eventfd_signal(ctx->cq_ev_fd, 1);
1633}
1634
c4a2ed72 1635/* Returns true if there are no backlogged entries after the flush */
6c2450ae 1636static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5 1637{
b18032bb 1638 bool all_flushed, posted;
1d7bb1d5 1639
a566c556 1640 if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
e23de15f 1641 return false;
1d7bb1d5 1642
b18032bb 1643 posted = false;
79ebeaee 1644 spin_lock(&ctx->completion_lock);
6c2450ae 1645 while (!list_empty(&ctx->cq_overflow_list)) {
d068b506 1646 struct io_uring_cqe *cqe = io_get_cqe(ctx);
6c2450ae 1647 struct io_overflow_cqe *ocqe;
e6c8aa9a 1648
1d7bb1d5
JA
1649 if (!cqe && !force)
1650 break;
6c2450ae
PB
1651 ocqe = list_first_entry(&ctx->cq_overflow_list,
1652 struct io_overflow_cqe, list);
1653 if (cqe)
1654 memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1655 else
8f6ed49a
PB
1656 io_account_cq_overflow(ctx);
1657
b18032bb 1658 posted = true;
6c2450ae
PB
1659 list_del(&ocqe->list);
1660 kfree(ocqe);
1d7bb1d5
JA
1661 }
1662
09e88404
PB
1663 all_flushed = list_empty(&ctx->cq_overflow_list);
1664 if (all_flushed) {
5ed7a37d 1665 clear_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1666 WRITE_ONCE(ctx->rings->sq_flags,
1667 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
09e88404 1668 }
46930143 1669
b18032bb
JA
1670 if (posted)
1671 io_commit_cqring(ctx);
79ebeaee 1672 spin_unlock(&ctx->completion_lock);
b18032bb
JA
1673 if (posted)
1674 io_cqring_ev_posted(ctx);
09e88404 1675 return all_flushed;
1d7bb1d5
JA
1676}
1677
90f67366 1678static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
6c503150 1679{
ca0a2651
JA
1680 bool ret = true;
1681
5ed7a37d 1682 if (test_bit(0, &ctx->check_cq_overflow)) {
6c503150
PB
1683 /* iopoll syncs against uring_lock, not completion_lock */
1684 if (ctx->flags & IORING_SETUP_IOPOLL)
1685 mutex_lock(&ctx->uring_lock);
90f67366 1686 ret = __io_cqring_overflow_flush(ctx, false);
6c503150
PB
1687 if (ctx->flags & IORING_SETUP_IOPOLL)
1688 mutex_unlock(&ctx->uring_lock);
1689 }
ca0a2651
JA
1690
1691 return ret;
6c503150
PB
1692}
1693
6a290a14
PB
1694/* must to be called somewhat shortly after putting a request */
1695static inline void io_put_task(struct task_struct *task, int nr)
1696{
1697 struct io_uring_task *tctx = task->io_uring;
1698
e98e49b2
PB
1699 if (likely(task == current)) {
1700 tctx->cached_refs += nr;
1701 } else {
1702 percpu_counter_sub(&tctx->inflight, nr);
1703 if (unlikely(atomic_read(&tctx->in_idle)))
1704 wake_up(&tctx->wait);
1705 put_task_struct_many(task, nr);
1706 }
6a290a14
PB
1707}
1708
9a10867a
PB
1709static void io_task_refs_refill(struct io_uring_task *tctx)
1710{
1711 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
1712
1713 percpu_counter_add(&tctx->inflight, refill);
1714 refcount_add(refill, &current->usage);
1715 tctx->cached_refs += refill;
1716}
1717
1718static inline void io_get_task_refs(int nr)
1719{
1720 struct io_uring_task *tctx = current->io_uring;
1721
1722 tctx->cached_refs -= nr;
1723 if (unlikely(tctx->cached_refs < 0))
1724 io_task_refs_refill(tctx);
1725}
1726
d4d19c19 1727static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1728 s32 res, u32 cflags)
2b188cc1 1729{
cce4b8b0 1730 struct io_overflow_cqe *ocqe;
2b188cc1 1731
cce4b8b0
PB
1732 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1733 if (!ocqe) {
1734 /*
1735 * If we're in ring overflow flush mode, or in task cancel mode,
1736 * or cannot allocate an overflow entry, then we need to drop it
1737 * on the floor.
1738 */
8f6ed49a 1739 io_account_cq_overflow(ctx);
cce4b8b0 1740 return false;
2b188cc1 1741 }
cce4b8b0 1742 if (list_empty(&ctx->cq_overflow_list)) {
5ed7a37d 1743 set_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1744 WRITE_ONCE(ctx->rings->sq_flags,
1745 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1746
cce4b8b0 1747 }
d4d19c19 1748 ocqe->cqe.user_data = user_data;
cce4b8b0
PB
1749 ocqe->cqe.res = res;
1750 ocqe->cqe.flags = cflags;
1751 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1752 return true;
2b188cc1
JA
1753}
1754
d4d19c19 1755static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1756 s32 res, u32 cflags)
2b188cc1
JA
1757{
1758 struct io_uring_cqe *cqe;
1759
d4d19c19 1760 trace_io_uring_complete(ctx, user_data, res, cflags);
51c3ff62 1761
2b188cc1
JA
1762 /*
1763 * If we can't get a cq entry, userspace overflowed the
1764 * submission (by quite a lot). Increment the overflow count in
1765 * the ring.
1766 */
d068b506 1767 cqe = io_get_cqe(ctx);
1d7bb1d5 1768 if (likely(cqe)) {
d4d19c19 1769 WRITE_ONCE(cqe->user_data, user_data);
2b188cc1 1770 WRITE_ONCE(cqe->res, res);
bcda7baa 1771 WRITE_ONCE(cqe->flags, cflags);
8d13326e 1772 return true;
2b188cc1 1773 }
d4d19c19 1774 return io_cqring_event_overflow(ctx, user_data, res, cflags);
2b188cc1
JA
1775}
1776
8d13326e 1777/* not as hot to bloat with inlining */
d4d19c19 1778static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1779 s32 res, u32 cflags)
bcda7baa 1780{
d4d19c19 1781 return __io_cqring_fill_event(ctx, user_data, res, cflags);
bcda7baa
JA
1782}
1783
54daa9b2
PB
1784static void io_req_complete_post(struct io_kiocb *req, s32 res,
1785 u32 cflags)
2b188cc1 1786{
78e19bbe 1787 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 1788
79ebeaee 1789 spin_lock(&ctx->completion_lock);
d4d19c19 1790 __io_cqring_fill_event(ctx, req->user_data, res, cflags);
c7dae4ba
JA
1791 /*
1792 * If we're the last reference to this request, add to our locked
1793 * free_list cache.
1794 */
de9b4cca 1795 if (req_ref_put_and_test(req)) {
7a612350 1796 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
0756a869 1797 if (req->flags & IO_DISARM_MASK)
7a612350
PB
1798 io_disarm_next(req);
1799 if (req->link) {
1800 io_req_task_queue(req->link);
1801 req->link = NULL;
1802 }
1803 }
c7dae4ba
JA
1804 io_dismantle_req(req);
1805 io_put_task(req->task, 1);
c2b6c6bc 1806 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
d0acdee2 1807 ctx->locked_free_nr++;
180f829f 1808 }
7a612350 1809 io_commit_cqring(ctx);
79ebeaee 1810 spin_unlock(&ctx->completion_lock);
a3f34907 1811 io_cqring_ev_posted(ctx);
229a7b63
JA
1812}
1813
54daa9b2
PB
1814static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
1815 u32 cflags)
229a7b63 1816{
a38d68db 1817 req->result = res;
d17e56eb 1818 req->cflags = cflags;
e342c807 1819 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
1820}
1821
889fca73 1822static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
54daa9b2 1823 s32 res, u32 cflags)
bcda7baa 1824{
889fca73
PB
1825 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1826 io_req_complete_state(req, res, cflags);
a38d68db 1827 else
c7dae4ba 1828 io_req_complete_post(req, res, cflags);
bcda7baa
JA
1829}
1830
54daa9b2 1831static inline void io_req_complete(struct io_kiocb *req, s32 res)
0ddf92e8 1832{
889fca73 1833 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
1834}
1835
54daa9b2 1836static void io_req_complete_failed(struct io_kiocb *req, s32 res)
f41db273 1837{
93d2bcd2 1838 req_set_fail(req);
f41db273
PB
1839 io_req_complete_post(req, res, 0);
1840}
1841
c6d3d9cb
PB
1842static void io_req_complete_fail_submit(struct io_kiocb *req)
1843{
1844 /*
1845 * We don't submit, fail them all, for that replace hardlinks with
1846 * normal links. Extra REQ_F_LINK is tolerated.
1847 */
1848 req->flags &= ~REQ_F_HARDLINK;
1849 req->flags |= REQ_F_LINK;
1850 io_req_complete_failed(req, req->result);
1851}
1852
864ea921
PB
1853/*
1854 * Don't initialise the fields below on every allocation, but do that in
1855 * advance and keep them valid across allocations.
1856 */
1857static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1858{
1859 req->ctx = ctx;
1860 req->link = NULL;
1861 req->async_data = NULL;
1862 /* not necessary, but safer to zero */
1863 req->result = 0;
1864}
1865
dac7a098 1866static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
cd0ca2e0 1867 struct io_submit_state *state)
dac7a098 1868{
79ebeaee 1869 spin_lock(&ctx->completion_lock);
c2b6c6bc 1870 wq_list_splice(&ctx->locked_free_list, &state->free_list);
d0acdee2 1871 ctx->locked_free_nr = 0;
79ebeaee 1872 spin_unlock(&ctx->completion_lock);
dac7a098
PB
1873}
1874
dd78f492 1875/* Returns true IFF there are requests in the cache */
c7dae4ba 1876static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
0ddf92e8 1877{
c7dae4ba 1878 struct io_submit_state *state = &ctx->submit_state;
0ddf92e8 1879
c7dae4ba
JA
1880 /*
1881 * If we have more than a batch's worth of requests in our IRQ side
1882 * locked cache, grab the lock and move them over to our submission
1883 * side cache.
1884 */
d0acdee2 1885 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
cd0ca2e0 1886 io_flush_cached_locked_reqs(ctx, state);
c2b6c6bc 1887 return !!state->free_list.next;
0ddf92e8
JA
1888}
1889
5d5901a3
PB
1890/*
1891 * A request might get retired back into the request caches even before opcode
1892 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1893 * Because of that, io_alloc_req() should be called only under ->uring_lock
1894 * and with extra caution to not get a request that is still worked on.
1895 */
c072481d 1896static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
5d5901a3 1897 __must_hold(&ctx->uring_lock)
2b188cc1 1898{
e5d1bc0a 1899 struct io_submit_state *state = &ctx->submit_state;
864ea921 1900 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
3ab665b7
PB
1901 void *reqs[IO_REQ_ALLOC_BATCH];
1902 struct io_kiocb *req;
864ea921 1903 int ret, i;
e5d1bc0a 1904
c2b6c6bc 1905 if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
a33ae9ce 1906 return true;
e5d1bc0a 1907
3ab665b7 1908 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
fd6fab2c 1909
864ea921
PB
1910 /*
1911 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1912 * retry single alloc to be on the safe side.
1913 */
1914 if (unlikely(ret <= 0)) {
3ab665b7
PB
1915 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1916 if (!reqs[0])
a33ae9ce 1917 return false;
864ea921 1918 ret = 1;
2b188cc1 1919 }
864ea921 1920
37f0e767 1921 percpu_ref_get_many(&ctx->refs, ret);
3ab665b7
PB
1922 for (i = 0; i < ret; i++) {
1923 req = reqs[i];
1924
1925 io_preinit_req(req, ctx);
c2b6c6bc 1926 wq_stack_add_head(&req->comp_list, &state->free_list);
3ab665b7 1927 }
a33ae9ce
PB
1928 return true;
1929}
1930
1931static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
1932{
1933 if (unlikely(!ctx->submit_state.free_list.next))
1934 return __io_alloc_req_refill(ctx);
1935 return true;
1936}
1937
1938static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1939{
1940 struct io_wq_work_node *node;
1941
1942 node = wq_stack_extract(&ctx->submit_state.free_list);
c2b6c6bc 1943 return container_of(node, struct io_kiocb, comp_list);
2b188cc1
JA
1944}
1945
e1d767f0 1946static inline void io_put_file(struct file *file)
8da11c19 1947{
e1d767f0 1948 if (file)
8da11c19
PB
1949 fput(file);
1950}
1951
6b639522 1952static inline void io_dismantle_req(struct io_kiocb *req)
2b188cc1 1953{
094bae49 1954 unsigned int flags = req->flags;
929a3af9 1955
867f8fa5 1956 if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
3a0a6902 1957 io_clean_op(req);
e1d767f0
PB
1958 if (!(flags & REQ_F_FIXED_FILE))
1959 io_put_file(req->file);
269bbe5f
BM
1960 if (req->fixed_rsrc_refs)
1961 percpu_ref_put(req->fixed_rsrc_refs);
e65ef56d
JA
1962}
1963
c072481d 1964static __cold void __io_free_req(struct io_kiocb *req)
c6ca97b3 1965{
51a4cc11 1966 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 1967
216578e5 1968 io_dismantle_req(req);
7c660731 1969 io_put_task(req->task, 1);
c6ca97b3 1970
79ebeaee 1971 spin_lock(&ctx->completion_lock);
c2b6c6bc 1972 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
c34b025f 1973 ctx->locked_free_nr++;
79ebeaee 1974 spin_unlock(&ctx->completion_lock);
e65ef56d
JA
1975}
1976
f2f87370
PB
1977static inline void io_remove_next_linked(struct io_kiocb *req)
1978{
1979 struct io_kiocb *nxt = req->link;
1980
1981 req->link = nxt->link;
1982 nxt->link = NULL;
1983}
1984
33cc89a9
PB
1985static bool io_kill_linked_timeout(struct io_kiocb *req)
1986 __must_hold(&req->ctx->completion_lock)
89b263f6 1987 __must_hold(&req->ctx->timeout_lock)
2665abfd 1988{
33cc89a9 1989 struct io_kiocb *link = req->link;
f2f87370 1990
b97e736a 1991 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
c9abd7ad 1992 struct io_timeout_data *io = link->async_data;
7c86ffee 1993
f2f87370 1994 io_remove_next_linked(req);
90cd7e42 1995 link->timeout.head = NULL;
fd9c7bc5 1996 if (hrtimer_try_to_cancel(&io->timer) != -1) {
ef9dd637 1997 list_del(&link->timeout.list);
d4d19c19
PB
1998 io_cqring_fill_event(link->ctx, link->user_data,
1999 -ECANCELED, 0);
91c2f697 2000 io_put_req_deferred(link);
d4729fbd 2001 return true;
c9abd7ad
PB
2002 }
2003 }
d4729fbd 2004 return false;
7c86ffee
PB
2005}
2006
d148ca4b 2007static void io_fail_links(struct io_kiocb *req)
33cc89a9 2008 __must_hold(&req->ctx->completion_lock)
9e645e11 2009{
33cc89a9 2010 struct io_kiocb *nxt, *link = req->link;
9e645e11 2011
f2f87370 2012 req->link = NULL;
f2f87370 2013 while (link) {
a8295b98
HX
2014 long res = -ECANCELED;
2015
2016 if (link->flags & REQ_F_FAIL)
2017 res = link->result;
2018
f2f87370
PB
2019 nxt = link->link;
2020 link->link = NULL;
2665abfd 2021
f2f87370 2022 trace_io_uring_fail_link(req, link);
a8295b98 2023 io_cqring_fill_event(link->ctx, link->user_data, res, 0);
91c2f697 2024 io_put_req_deferred(link);
f2f87370 2025 link = nxt;
9e645e11 2026 }
33cc89a9 2027}
9e645e11 2028
33cc89a9
PB
2029static bool io_disarm_next(struct io_kiocb *req)
2030 __must_hold(&req->ctx->completion_lock)
2031{
2032 bool posted = false;
2033
0756a869
PB
2034 if (req->flags & REQ_F_ARM_LTIMEOUT) {
2035 struct io_kiocb *link = req->link;
2036
906c6caa 2037 req->flags &= ~REQ_F_ARM_LTIMEOUT;
0756a869
PB
2038 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2039 io_remove_next_linked(req);
2040 io_cqring_fill_event(link->ctx, link->user_data,
2041 -ECANCELED, 0);
2042 io_put_req_deferred(link);
2043 posted = true;
2044 }
2045 } else if (req->flags & REQ_F_LINK_TIMEOUT) {
89b263f6
JA
2046 struct io_ring_ctx *ctx = req->ctx;
2047
2048 spin_lock_irq(&ctx->timeout_lock);
33cc89a9 2049 posted = io_kill_linked_timeout(req);
89b263f6
JA
2050 spin_unlock_irq(&ctx->timeout_lock);
2051 }
93d2bcd2 2052 if (unlikely((req->flags & REQ_F_FAIL) &&
e4335ed3 2053 !(req->flags & REQ_F_HARDLINK))) {
33cc89a9
PB
2054 posted |= (req->link != NULL);
2055 io_fail_links(req);
2056 }
2057 return posted;
9e645e11
JA
2058}
2059
d81499bf
PB
2060static void __io_req_find_next_prep(struct io_kiocb *req)
2061{
2062 struct io_ring_ctx *ctx = req->ctx;
2063 bool posted;
2064
2065 spin_lock(&ctx->completion_lock);
2066 posted = io_disarm_next(req);
2067 if (posted)
2068 io_commit_cqring(req->ctx);
2069 spin_unlock(&ctx->completion_lock);
2070 if (posted)
2071 io_cqring_ev_posted(ctx);
2072}
2073
2074static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
c69f8dbe 2075{
33cc89a9 2076 struct io_kiocb *nxt;
944e58bf 2077
d81499bf
PB
2078 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2079 return NULL;
9e645e11
JA
2080 /*
2081 * If LINK is set, we have dependent requests in this chain. If we
2082 * didn't fail this request, queue the first one up, moving any other
2083 * dependencies to the next request. In case of failure, fail the rest
2084 * of the chain.
2085 */
d81499bf
PB
2086 if (unlikely(req->flags & IO_DISARM_MASK))
2087 __io_req_find_next_prep(req);
33cc89a9
PB
2088 nxt = req->link;
2089 req->link = NULL;
2090 return nxt;
4d7dd462 2091}
9e645e11 2092
f237c30a 2093static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2c32395d
PB
2094{
2095 if (!ctx)
2096 return;
f237c30a 2097 if (*locked) {
c450178d 2098 io_submit_flush_completions(ctx);
2c32395d 2099 mutex_unlock(&ctx->uring_lock);
f237c30a 2100 *locked = false;
2c32395d
PB
2101 }
2102 percpu_ref_put(&ctx->refs);
2103}
2104
7cbf1722 2105static void tctx_task_work(struct callback_head *cb)
c40f6379 2106{
f237c30a 2107 bool locked = false;
ebd0df2e 2108 struct io_ring_ctx *ctx = NULL;
3f18407d
PB
2109 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2110 task_work);
c40f6379 2111
16f72070 2112 while (1) {
3f18407d
PB
2113 struct io_wq_work_node *node;
2114
c450178d 2115 if (!tctx->task_list.first && locked)
8d4ad41e
PB
2116 io_submit_flush_completions(ctx);
2117
3f18407d 2118 spin_lock_irq(&tctx->task_lock);
c6538be9 2119 node = tctx->task_list.first;
3f18407d 2120 INIT_WQ_LIST(&tctx->task_list);
6294f368
PB
2121 if (!node)
2122 tctx->task_running = false;
3f18407d 2123 spin_unlock_irq(&tctx->task_lock);
6294f368
PB
2124 if (!node)
2125 break;
3f18407d 2126
6294f368 2127 do {
3f18407d
PB
2128 struct io_wq_work_node *next = node->next;
2129 struct io_kiocb *req = container_of(node, struct io_kiocb,
2130 io_task_work.node);
2131
2132 if (req->ctx != ctx) {
f237c30a 2133 ctx_flush_and_put(ctx, &locked);
3f18407d 2134 ctx = req->ctx;
126180b9
PB
2135 /* if not contended, grab and improve batching */
2136 locked = mutex_trylock(&ctx->uring_lock);
3f18407d
PB
2137 percpu_ref_get(&ctx->refs);
2138 }
f237c30a 2139 req->io_task_work.func(req, &locked);
3f18407d 2140 node = next;
6294f368
PB
2141 } while (node);
2142
7cbf1722 2143 cond_resched();
3f18407d 2144 }
ebd0df2e 2145
f237c30a 2146 ctx_flush_and_put(ctx, &locked);
7cbf1722
JA
2147}
2148
e09ee510 2149static void io_req_task_work_add(struct io_kiocb *req)
7cbf1722 2150{
c15b79de 2151 struct task_struct *tsk = req->task;
7cbf1722 2152 struct io_uring_task *tctx = tsk->io_uring;
c15b79de 2153 enum task_work_notify_mode notify;
e09ee510 2154 struct io_wq_work_node *node;
0b81e80c 2155 unsigned long flags;
6294f368 2156 bool running;
7cbf1722
JA
2157
2158 WARN_ON_ONCE(!tctx);
2159
0b81e80c 2160 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722 2161 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
6294f368
PB
2162 running = tctx->task_running;
2163 if (!running)
2164 tctx->task_running = true;
0b81e80c 2165 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2166
2167 /* task_work already pending, we're done */
6294f368 2168 if (running)
e09ee510 2169 return;
7cbf1722 2170
c15b79de
PB
2171 /*
2172 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2173 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2174 * processing task_work. There's no reliable way to tell if TWA_RESUME
2175 * will do the job.
2176 */
2177 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
d97ec623
PB
2178 if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2179 if (notify == TWA_NONE)
2180 wake_up_process(tsk);
e09ee510 2181 return;
c15b79de 2182 }
2215bed9 2183
0b81e80c 2184 spin_lock_irqsave(&tctx->task_lock, flags);
6294f368 2185 tctx->task_running = false;
e09ee510
PB
2186 node = tctx->task_list.first;
2187 INIT_WQ_LIST(&tctx->task_list);
0b81e80c 2188 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722 2189
e09ee510
PB
2190 while (node) {
2191 req = container_of(node, struct io_kiocb, io_task_work.node);
2192 node = node->next;
2193 if (llist_add(&req->io_task_work.fallback_node,
2194 &req->ctx->fallback_llist))
2195 schedule_delayed_work(&req->ctx->fallback_work, 1);
2196 }
eab30c4d
PB
2197}
2198
f237c30a 2199static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
c40f6379 2200{
87ceb6a6 2201 struct io_ring_ctx *ctx = req->ctx;
c40f6379 2202
b18a1a45 2203 /* not needed for normal modes, but SQPOLL depends on it */
f237c30a 2204 io_tw_lock(ctx, locked);
2593553a 2205 io_req_complete_failed(req, req->result);
c40f6379
JA
2206}
2207
f237c30a 2208static void io_req_task_submit(struct io_kiocb *req, bool *locked)
c40f6379
JA
2209{
2210 struct io_ring_ctx *ctx = req->ctx;
2211
f237c30a 2212 io_tw_lock(ctx, locked);
316319e8 2213 /* req->task == current here, checking PF_EXITING is safe */
af066f31 2214 if (likely(!(req->task->flags & PF_EXITING)))
c5eef2b9 2215 __io_queue_sqe(req);
81b6d05c 2216 else
2593553a 2217 io_req_complete_failed(req, -EFAULT);
c40f6379
JA
2218}
2219
2c4b8eb6 2220static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
c40f6379 2221{
2c4b8eb6 2222 req->result = ret;
5b0a6acc 2223 req->io_task_work.func = io_req_task_cancel;
e09ee510 2224 io_req_task_work_add(req);
c40f6379
JA
2225}
2226
2c4b8eb6 2227static void io_req_task_queue(struct io_kiocb *req)
a3df7698 2228{
5b0a6acc 2229 req->io_task_work.func = io_req_task_submit;
e09ee510 2230 io_req_task_work_add(req);
a3df7698
PB
2231}
2232
773af691
JA
2233static void io_req_task_queue_reissue(struct io_kiocb *req)
2234{
2235 req->io_task_work.func = io_queue_async_work;
2236 io_req_task_work_add(req);
2237}
2238
f2f87370 2239static inline void io_queue_next(struct io_kiocb *req)
c69f8dbe 2240{
9b5f7bd9 2241 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf
PB
2242
2243 if (nxt)
906a8c3f 2244 io_req_task_queue(nxt);
c69f8dbe
JL
2245}
2246
c3524383 2247static void io_free_req(struct io_kiocb *req)
7a743e22 2248{
c3524383
PB
2249 io_queue_next(req);
2250 __io_free_req(req);
2251}
8766dd51 2252
f237c30a
PB
2253static void io_free_req_work(struct io_kiocb *req, bool *locked)
2254{
2255 io_free_req(req);
2256}
2257
3aa83bfb 2258static void io_free_batch_list(struct io_ring_ctx *ctx,
1cce17ac 2259 struct io_wq_work_node *node)
3aa83bfb
PB
2260 __must_hold(&ctx->uring_lock)
2261{
d4b7a5ef 2262 struct task_struct *task = NULL;
37f0e767 2263 int task_refs = 0;
3aa83bfb 2264
3aa83bfb
PB
2265 do {
2266 struct io_kiocb *req = container_of(node, struct io_kiocb,
2267 comp_list);
2268
c1e53a69
PB
2269 if (!req_ref_put_and_test(req)) {
2270 node = req->comp_list.next;
d4b7a5ef 2271 continue;
c1e53a69 2272 }
d4b7a5ef
PB
2273
2274 io_queue_next(req);
2275 io_dismantle_req(req);
2276
2277 if (req->task != task) {
2278 if (task)
2279 io_put_task(task, task_refs);
2280 task = req->task;
2281 task_refs = 0;
2282 }
2283 task_refs++;
c1e53a69 2284 node = req->comp_list.next;
d4b7a5ef 2285 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
3aa83bfb 2286 } while (node);
d4b7a5ef 2287
d4b7a5ef
PB
2288 if (task)
2289 io_put_task(task, task_refs);
3aa83bfb
PB
2290}
2291
c450178d 2292static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
a141dd89 2293 __must_hold(&ctx->uring_lock)
905c172f 2294{
6f33b0bc 2295 struct io_wq_work_node *node, *prev;
cd0ca2e0 2296 struct io_submit_state *state = &ctx->submit_state;
905c172f 2297
79ebeaee 2298 spin_lock(&ctx->completion_lock);
6f33b0bc
PB
2299 wq_list_for_each(node, prev, &state->compl_reqs) {
2300 struct io_kiocb *req = container_of(node, struct io_kiocb,
2301 comp_list);
5182ed2e 2302
d4d19c19 2303 __io_cqring_fill_event(ctx, req->user_data, req->result,
d17e56eb 2304 req->cflags);
905c172f
PB
2305 }
2306 io_commit_cqring(ctx);
79ebeaee 2307 spin_unlock(&ctx->completion_lock);
905c172f 2308 io_cqring_ev_posted(ctx);
5182ed2e 2309
1cce17ac 2310 io_free_batch_list(ctx, state->compl_reqs.first);
6f33b0bc 2311 INIT_WQ_LIST(&state->compl_reqs);
7a743e22
PB
2312}
2313
ba816ad6
JA
2314/*
2315 * Drop reference to request, return next in chain (if there is one) if this
2316 * was the last reference to this request.
2317 */
0d85035a 2318static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2319{
9b5f7bd9
PB
2320 struct io_kiocb *nxt = NULL;
2321
de9b4cca 2322 if (req_ref_put_and_test(req)) {
9b5f7bd9 2323 nxt = io_req_find_next(req);
4d7dd462 2324 __io_free_req(req);
2a44f467 2325 }
9b5f7bd9 2326 return nxt;
2b188cc1
JA
2327}
2328
0d85035a 2329static inline void io_put_req(struct io_kiocb *req)
e65ef56d 2330{
de9b4cca 2331 if (req_ref_put_and_test(req))
e65ef56d 2332 io_free_req(req);
2b188cc1
JA
2333}
2334
91c2f697 2335static inline void io_put_req_deferred(struct io_kiocb *req)
216578e5 2336{
91c2f697 2337 if (req_ref_put_and_test(req)) {
f237c30a 2338 req->io_task_work.func = io_free_req_work;
543af3a1
PB
2339 io_req_task_work_add(req);
2340 }
216578e5
PB
2341}
2342
6c503150 2343static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2344{
2345 /* See comment at the top of this file */
2346 smp_rmb();
e23de15f 2347 return __io_cqring_events(ctx);
a3a0e43f
JA
2348}
2349
fb5ccc98
PB
2350static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2351{
2352 struct io_rings *rings = ctx->rings;
2353
2354 /* make sure SQ entry isn't read before tail */
2355 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2356}
2357
8ff069bf 2358static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
e94f141b 2359{
8ff069bf 2360 unsigned int cflags;
e94f141b 2361
bcda7baa
JA
2362 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2363 cflags |= IORING_CQE_F_BUFFER;
0e1b6fe3 2364 req->flags &= ~REQ_F_BUFFER_SELECTED;
bcda7baa
JA
2365 kfree(kbuf);
2366 return cflags;
e94f141b
JA
2367}
2368
8ff069bf 2369static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
bcda7baa 2370{
ae421d93
PB
2371 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2372 return 0;
30d51dd4 2373 return io_put_kbuf(req, req->kbuf);
8ff069bf
PB
2374}
2375
4c6e277c
JA
2376static inline bool io_run_task_work(void)
2377{
ef98eb04 2378 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
4c6e277c 2379 __set_current_state(TASK_RUNNING);
ef98eb04 2380 tracehook_notify_signal();
4c6e277c
JA
2381 return true;
2382 }
2383
2384 return false;
bcda7baa
JA
2385}
2386
5ba3c874 2387static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
def596e9 2388{
5eef4e87 2389 struct io_wq_work_node *pos, *start, *prev;
d729cf9a 2390 unsigned int poll_flags = BLK_POLL_NOSLEEP;
b688f11e 2391 DEFINE_IO_COMP_BATCH(iob);
5ba3c874 2392 int nr_events = 0;
def596e9
JA
2393
2394 /*
2395 * Only spin for completions if we don't have multiple devices hanging
87a115fb 2396 * off our complete list.
def596e9 2397 */
87a115fb 2398 if (ctx->poll_multi_queue || force_nonspin)
ef99b2d3 2399 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2400
5eef4e87
PB
2401 wq_list_for_each(pos, start, &ctx->iopoll_list) {
2402 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
9adbd45d 2403 struct kiocb *kiocb = &req->rw.kiocb;
a2416e1e 2404 int ret;
def596e9
JA
2405
2406 /*
581f9810
BM
2407 * Move completed and retryable entries to our local lists.
2408 * If we find a request that requires polling, break out
2409 * and complete those lists first, if we have entries there.
def596e9 2410 */
e3f721e6 2411 if (READ_ONCE(req->iopoll_completed))
def596e9
JA
2412 break;
2413
b688f11e 2414 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
a2416e1e
PB
2415 if (unlikely(ret < 0))
2416 return ret;
2417 else if (ret)
ef99b2d3 2418 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2419
3aadc23e 2420 /* iopoll may have completed current req */
b688f11e 2421 if (!rq_list_empty(iob.req_list) ||
e3f721e6
PB
2422 READ_ONCE(req->iopoll_completed))
2423 break;
def596e9
JA
2424 }
2425
b688f11e
JA
2426 if (!rq_list_empty(iob.req_list))
2427 iob.complete(&iob);
5eef4e87
PB
2428 else if (!pos)
2429 return 0;
2430
2431 prev = start;
2432 wq_list_for_each_resume(pos, prev) {
2433 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2434
b3fa03fd
PB
2435 /* order with io_complete_rw_iopoll(), e.g. ->result updates */
2436 if (!smp_load_acquire(&req->iopoll_completed))
e3f721e6 2437 break;
b3fa03fd 2438 __io_cqring_fill_event(ctx, req->user_data, req->result,
f5ed3bcd 2439 io_put_rw_kbuf(req));
e3f721e6
PB
2440 nr_events++;
2441 }
def596e9 2442
f5ed3bcd
PB
2443 if (unlikely(!nr_events))
2444 return 0;
2445
2446 io_commit_cqring(ctx);
2447 io_cqring_ev_posted_iopoll(ctx);
1cce17ac 2448 pos = start ? start->next : ctx->iopoll_list.first;
5eef4e87 2449 wq_list_cut(&ctx->iopoll_list, prev, start);
1cce17ac 2450 io_free_batch_list(ctx, pos);
5ba3c874 2451 return nr_events;
def596e9
JA
2452}
2453
def596e9
JA
2454/*
2455 * We can't just wait for polled events to come to us, we have to actively
2456 * find and complete them.
2457 */
c072481d 2458static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2459{
2460 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2461 return;
2462
2463 mutex_lock(&ctx->uring_lock);
5eef4e87 2464 while (!wq_list_empty(&ctx->iopoll_list)) {
b2edc0a7 2465 /* let it sleep and repeat later if can't complete a request */
5ba3c874 2466 if (io_do_iopoll(ctx, true) == 0)
b2edc0a7 2467 break;
08f5439f
JA
2468 /*
2469 * Ensure we allow local-to-the-cpu processing to take place,
2470 * in this case we need to ensure that we reap all events.
3fcee5a6 2471 * Also let task_work, etc. to progress by releasing the mutex
08f5439f 2472 */
3fcee5a6
PB
2473 if (need_resched()) {
2474 mutex_unlock(&ctx->uring_lock);
2475 cond_resched();
2476 mutex_lock(&ctx->uring_lock);
2477 }
def596e9
JA
2478 }
2479 mutex_unlock(&ctx->uring_lock);
2480}
2481
7668b92a 2482static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2483{
7668b92a 2484 unsigned int nr_events = 0;
e9979b36 2485 int ret = 0;
500f9fba 2486
c7849be9
XW
2487 /*
2488 * We disallow the app entering submit/complete with polling, but we
2489 * still need to lock the ring to prevent racing with polled issue
2490 * that got punted to a workqueue.
2491 */
2492 mutex_lock(&ctx->uring_lock);
f39c8a5b
PB
2493 /*
2494 * Don't enter poll loop if we already have events pending.
2495 * If we do, we can potentially be spinning for commands that
2496 * already triggered a CQE (eg in error).
2497 */
5ed7a37d 2498 if (test_bit(0, &ctx->check_cq_overflow))
f39c8a5b
PB
2499 __io_cqring_overflow_flush(ctx, false);
2500 if (io_cqring_events(ctx))
2501 goto out;
def596e9 2502 do {
500f9fba
JA
2503 /*
2504 * If a submit got punted to a workqueue, we can have the
2505 * application entering polling for a command before it gets
2506 * issued. That app will hold the uring_lock for the duration
2507 * of the poll right here, so we need to take a breather every
2508 * now and then to ensure that the issue has a chance to add
2509 * the poll to the issued list. Otherwise we can spin here
2510 * forever, while the workqueue is stuck trying to acquire the
2511 * very same mutex.
2512 */
5eef4e87 2513 if (wq_list_empty(&ctx->iopoll_list)) {
8f487ef2
PB
2514 u32 tail = ctx->cached_cq_tail;
2515
500f9fba 2516 mutex_unlock(&ctx->uring_lock);
4c6e277c 2517 io_run_task_work();
500f9fba 2518 mutex_lock(&ctx->uring_lock);
def596e9 2519
8f487ef2
PB
2520 /* some requests don't go through iopoll_list */
2521 if (tail != ctx->cached_cq_tail ||
5eef4e87 2522 wq_list_empty(&ctx->iopoll_list))
e9979b36 2523 break;
500f9fba 2524 }
5ba3c874
PB
2525 ret = io_do_iopoll(ctx, !min);
2526 if (ret < 0)
2527 break;
2528 nr_events += ret;
2529 ret = 0;
2530 } while (nr_events < min && !need_resched());
f39c8a5b 2531out:
500f9fba 2532 mutex_unlock(&ctx->uring_lock);
def596e9
JA
2533 return ret;
2534}
2535
491381ce 2536static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2537{
491381ce
JA
2538 /*
2539 * Tell lockdep we inherited freeze protection from submission
2540 * thread.
2541 */
2542 if (req->flags & REQ_F_ISREG) {
1c98679d 2543 struct super_block *sb = file_inode(req->file)->i_sb;
2b188cc1 2544
1c98679d
PB
2545 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2546 sb_end_write(sb);
2b188cc1
JA
2547 }
2548}
2549
b63534c4 2550#ifdef CONFIG_BLOCK
dc2a6e9a 2551static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4 2552{
ab454438 2553 struct io_async_rw *rw = req->async_data;
b63534c4 2554
d886e185 2555 if (!req_has_async_data(req))
ab454438 2556 return !io_req_prep_async(req);
cd658695 2557 iov_iter_restore(&rw->iter, &rw->iter_state);
ab454438 2558 return true;
b63534c4 2559}
b63534c4 2560
3e6a0d3c 2561static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 2562{
355afaeb 2563 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 2564 struct io_ring_ctx *ctx = req->ctx;
b63534c4 2565
355afaeb
JA
2566 if (!S_ISBLK(mode) && !S_ISREG(mode))
2567 return false;
3e6a0d3c
JA
2568 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2569 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 2570 return false;
7c977a58
JA
2571 /*
2572 * If ref is dying, we might be running poll reap from the exit work.
2573 * Don't attempt to reissue from that path, just let it fail with
2574 * -EAGAIN.
2575 */
3e6a0d3c
JA
2576 if (percpu_ref_is_dying(&ctx->refs))
2577 return false;
ef046888
JA
2578 /*
2579 * Play it safe and assume not safe to re-import and reissue if we're
2580 * not in the original thread group (or in task context).
2581 */
2582 if (!same_thread_group(req->task, current) || !in_task())
2583 return false;
3e6a0d3c
JA
2584 return true;
2585}
e82ad485 2586#else
a1ff1e3f 2587static bool io_resubmit_prep(struct io_kiocb *req)
e82ad485
JA
2588{
2589 return false;
2590}
e82ad485 2591static bool io_rw_should_reissue(struct io_kiocb *req)
3e6a0d3c 2592{
b63534c4
JA
2593 return false;
2594}
3e6a0d3c 2595#endif
b63534c4 2596
8ef12efe 2597static bool __io_complete_rw_common(struct io_kiocb *req, long res)
a1d7c393 2598{
b65c128f
PB
2599 if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2600 kiocb_end_write(req);
9532b99b
PB
2601 if (res != req->result) {
2602 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2603 io_rw_should_reissue(req)) {
2604 req->flags |= REQ_F_REISSUE;
8ef12efe 2605 return true;
9532b99b 2606 }
93d2bcd2 2607 req_set_fail(req);
8ef12efe 2608 req->result = res;
9532b99b 2609 }
8ef12efe
JA
2610 return false;
2611}
2612
f237c30a 2613static void io_req_task_complete(struct io_kiocb *req, bool *locked)
8ef12efe 2614{
126180b9 2615 unsigned int cflags = io_put_rw_kbuf(req);
54daa9b2 2616 int res = req->result;
126180b9 2617
fff4e40e 2618 if (*locked) {
126180b9 2619 io_req_complete_state(req, res, cflags);
fff4e40e
PB
2620 io_req_add_compl_list(req);
2621 } else {
126180b9 2622 io_req_complete_post(req, res, cflags);
fff4e40e 2623 }
8ef12efe
JA
2624}
2625
2626static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2627 unsigned int issue_flags)
2628{
2629 if (__io_complete_rw_common(req, res))
2630 return;
63637853 2631 __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
ba816ad6
JA
2632}
2633
2634static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2635{
9adbd45d 2636 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 2637
8ef12efe
JA
2638 if (__io_complete_rw_common(req, res))
2639 return;
2640 req->result = res;
2641 req->io_task_work.func = io_req_task_complete;
2642 io_req_task_work_add(req);
2b188cc1
JA
2643}
2644
def596e9
JA
2645static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2646{
9adbd45d 2647 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 2648
491381ce
JA
2649 if (kiocb->ki_flags & IOCB_WRITE)
2650 kiocb_end_write(req);
9532b99b 2651 if (unlikely(res != req->result)) {
b66ceaf3
PB
2652 if (res == -EAGAIN && io_rw_should_reissue(req)) {
2653 req->flags |= REQ_F_REISSUE;
2654 return;
9532b99b 2655 }
8c130827 2656 }
bbde017a 2657
b3fa03fd
PB
2658 req->result = res;
2659 /* order with io_iopoll_complete() checking ->iopoll_completed */
2660 smp_store_release(&req->iopoll_completed, 1);
def596e9
JA
2661}
2662
2663/*
2664 * After the iocb has been issued, it's safe to be found on the poll list.
2665 * Adding the kiocb to the list AFTER submission ensures that we don't
f39c8a5b 2666 * find it from a io_do_iopoll() thread before the issuer is done
def596e9
JA
2667 * accessing the kiocb cookie.
2668 */
cb3d8972 2669static void io_iopoll_req_issued(struct io_kiocb *req)
def596e9
JA
2670{
2671 struct io_ring_ctx *ctx = req->ctx;
cb3d8972
PB
2672 const bool in_async = io_wq_current_is_worker();
2673
2674 /* workqueue context doesn't hold uring_lock, grab it now */
2675 if (unlikely(in_async))
2676 mutex_lock(&ctx->uring_lock);
def596e9
JA
2677
2678 /*
2679 * Track whether we have multiple files in our lists. This will impact
2680 * how we do polling eventually, not spinning if we're on potentially
2681 * different devices.
2682 */
5eef4e87 2683 if (wq_list_empty(&ctx->iopoll_list)) {
915b3dde
HX
2684 ctx->poll_multi_queue = false;
2685 } else if (!ctx->poll_multi_queue) {
def596e9
JA
2686 struct io_kiocb *list_req;
2687
5eef4e87
PB
2688 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
2689 comp_list);
30da1b45 2690 if (list_req->file != req->file)
915b3dde 2691 ctx->poll_multi_queue = true;
def596e9
JA
2692 }
2693
2694 /*
2695 * For fast devices, IO may have already completed. If it has, add
2696 * it to the front so we find it first.
2697 */
65a6543d 2698 if (READ_ONCE(req->iopoll_completed))
5eef4e87 2699 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
def596e9 2700 else
5eef4e87 2701 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
bdcd3eab 2702
cb3d8972
PB
2703 if (unlikely(in_async)) {
2704 /*
2705 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
2706 * in sq thread task context or in io worker task context. If
2707 * current task context is sq thread, we don't need to check
2708 * whether should wake up sq thread.
2709 */
2710 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2711 wq_has_sleeper(&ctx->sq_data->wait))
2712 wake_up(&ctx->sq_data->wait);
2713
2714 mutex_unlock(&ctx->uring_lock);
2715 }
def596e9
JA
2716}
2717
4503b767
JA
2718static bool io_bdev_nowait(struct block_device *bdev)
2719{
9ba0d0c8 2720 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
2721}
2722
2b188cc1
JA
2723/*
2724 * If we tracked the file through the SCM inflight mechanism, we could support
2725 * any file. For now, just ensure that anything potentially problematic is done
2726 * inline.
2727 */
b191e2df 2728static bool __io_file_supports_nowait(struct file *file, int rw)
2b188cc1
JA
2729{
2730 umode_t mode = file_inode(file)->i_mode;
2731
4503b767 2732 if (S_ISBLK(mode)) {
4e7b5671
CH
2733 if (IS_ENABLED(CONFIG_BLOCK) &&
2734 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
2735 return true;
2736 return false;
2737 }
976517f1 2738 if (S_ISSOCK(mode))
2b188cc1 2739 return true;
4503b767 2740 if (S_ISREG(mode)) {
4e7b5671
CH
2741 if (IS_ENABLED(CONFIG_BLOCK) &&
2742 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
2743 file->f_op != &io_uring_fops)
2744 return true;
2745 return false;
2746 }
2b188cc1 2747
c5b85625
JA
2748 /* any ->read/write should understand O_NONBLOCK */
2749 if (file->f_flags & O_NONBLOCK)
2750 return true;
2751
af197f50
JA
2752 if (!(file->f_mode & FMODE_NOWAIT))
2753 return false;
2754
2755 if (rw == READ)
2756 return file->f_op->read_iter != NULL;
2757
2758 return file->f_op->write_iter != NULL;
2b188cc1
JA
2759}
2760
b191e2df 2761static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
7b29f92d 2762{
b191e2df 2763 if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
7b29f92d 2764 return true;
b191e2df 2765 else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
7b29f92d
JA
2766 return true;
2767
b191e2df 2768 return __io_file_supports_nowait(req->file, rw);
7b29f92d
JA
2769}
2770
5d329e12
JA
2771static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2772 int rw)
2b188cc1 2773{
def596e9 2774 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 2775 struct kiocb *kiocb = &req->rw.kiocb;
75c668cd 2776 struct file *file = req->file;
09bb8394
JA
2777 unsigned ioprio;
2778 int ret;
2b188cc1 2779
c97d8a0f 2780 if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
491381ce
JA
2781 req->flags |= REQ_F_ISREG;
2782
2b188cc1 2783 kiocb->ki_pos = READ_ONCE(sqe->off);
75c668cd 2784 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
ba04291e 2785 req->flags |= REQ_F_CUR_POS;
75c668cd 2786 kiocb->ki_pos = file->f_pos;
ba04291e 2787 }
2b188cc1 2788 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
2789 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2790 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2791 if (unlikely(ret))
2792 return ret;
2b188cc1 2793
5d329e12
JA
2794 /*
2795 * If the file is marked O_NONBLOCK, still allow retry for it if it
2796 * supports async. Otherwise it's impossible to use O_NONBLOCK files
2797 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
2798 */
2799 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2800 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
75c668cd
PB
2801 req->flags |= REQ_F_NOWAIT;
2802
2b188cc1
JA
2803 ioprio = READ_ONCE(sqe->ioprio);
2804 if (ioprio) {
2805 ret = ioprio_check_cap(ioprio);
2806 if (ret)
09bb8394 2807 return ret;
2b188cc1
JA
2808
2809 kiocb->ki_ioprio = ioprio;
2810 } else
2811 kiocb->ki_ioprio = get_current_ioprio();
2812
def596e9 2813 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
2814 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2815 !kiocb->ki_filp->f_op->iopoll)
09bb8394 2816 return -EOPNOTSUPP;
2b188cc1 2817
394918eb 2818 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
def596e9 2819 kiocb->ki_complete = io_complete_rw_iopoll;
65a6543d 2820 req->iopoll_completed = 0;
def596e9 2821 } else {
09bb8394
JA
2822 if (kiocb->ki_flags & IOCB_HIPRI)
2823 return -EINVAL;
def596e9
JA
2824 kiocb->ki_complete = io_complete_rw;
2825 }
9adbd45d 2826
eae071c9
PB
2827 if (req->opcode == IORING_OP_READ_FIXED ||
2828 req->opcode == IORING_OP_WRITE_FIXED) {
2829 req->imu = NULL;
2830 io_req_set_rsrc_node(req);
2831 }
2832
3529d8c2
JA
2833 req->rw.addr = READ_ONCE(sqe->addr);
2834 req->rw.len = READ_ONCE(sqe->len);
4f4eeba8 2835 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 2836 return 0;
2b188cc1
JA
2837}
2838
2839static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2840{
2841 switch (ret) {
2842 case -EIOCBQUEUED:
2843 break;
2844 case -ERESTARTSYS:
2845 case -ERESTARTNOINTR:
2846 case -ERESTARTNOHAND:
2847 case -ERESTART_RESTARTBLOCK:
2848 /*
2849 * We can't just restart the syscall, since previously
2850 * submitted sqes may already be in progress. Just fail this
2851 * IO with EINTR.
2852 */
2853 ret = -EINTR;
df561f66 2854 fallthrough;
2b188cc1
JA
2855 default:
2856 kiocb->ki_complete(kiocb, ret, 0);
2857 }
2858}
2859
a1d7c393 2860static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
889fca73 2861 unsigned int issue_flags)
ba816ad6 2862{
ba04291e 2863 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
e8c2bc1f 2864 struct io_async_rw *io = req->async_data;
ba04291e 2865
227c0c96 2866 /* add previously done IO, if any */
d886e185 2867 if (req_has_async_data(req) && io->bytes_done > 0) {
227c0c96 2868 if (ret < 0)
e8c2bc1f 2869 ret = io->bytes_done;
227c0c96 2870 else
e8c2bc1f 2871 ret += io->bytes_done;
227c0c96
JA
2872 }
2873
ba04291e
JA
2874 if (req->flags & REQ_F_CUR_POS)
2875 req->file->f_pos = kiocb->ki_pos;
b66ceaf3 2876 if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
889fca73 2877 __io_complete_rw(req, ret, 0, issue_flags);
ba816ad6
JA
2878 else
2879 io_rw_done(kiocb, ret);
97284637 2880
b66ceaf3 2881 if (req->flags & REQ_F_REISSUE) {
97284637 2882 req->flags &= ~REQ_F_REISSUE;
a7be7c23 2883 if (io_resubmit_prep(req)) {
773af691 2884 io_req_task_queue_reissue(req);
8c130827 2885 } else {
b66ceaf3
PB
2886 unsigned int cflags = io_put_rw_kbuf(req);
2887 struct io_ring_ctx *ctx = req->ctx;
2888
93d2bcd2 2889 req_set_fail(req);
14cfbb7a 2890 if (!(issue_flags & IO_URING_F_NONBLOCK)) {
b66ceaf3
PB
2891 mutex_lock(&ctx->uring_lock);
2892 __io_req_complete(req, issue_flags, ret, cflags);
2893 mutex_unlock(&ctx->uring_lock);
2894 } else {
2895 __io_req_complete(req, issue_flags, ret, cflags);
2896 }
97284637
PB
2897 }
2898 }
ba816ad6
JA
2899}
2900
eae071c9
PB
2901static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
2902 struct io_mapped_ubuf *imu)
edafccee 2903{
9adbd45d 2904 size_t len = req->rw.len;
75769e3f 2905 u64 buf_end, buf_addr = req->rw.addr;
edafccee 2906 size_t offset;
edafccee 2907
75769e3f 2908 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
edafccee
JA
2909 return -EFAULT;
2910 /* not inside the mapped region */
4751f53d 2911 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
edafccee
JA
2912 return -EFAULT;
2913
2914 /*
2915 * May not be a start of buffer, set size appropriately
2916 * and advance us to the beginning.
2917 */
2918 offset = buf_addr - imu->ubuf;
2919 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2920
2921 if (offset) {
2922 /*
2923 * Don't use iov_iter_advance() here, as it's really slow for
2924 * using the latter parts of a big fixed buffer - it iterates
2925 * over each segment manually. We can cheat a bit here, because
2926 * we know that:
2927 *
2928 * 1) it's a BVEC iter, we set it up
2929 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2930 * first and last bvec
2931 *
2932 * So just find our index, and adjust the iterator afterwards.
2933 * If the offset is within the first bvec (or the whole first
2934 * bvec, just use iov_iter_advance(). This makes it easier
2935 * since we can just skip the first segment, which may not
2936 * be PAGE_SIZE aligned.
2937 */
2938 const struct bio_vec *bvec = imu->bvec;
2939
2940 if (offset <= bvec->bv_len) {
2941 iov_iter_advance(iter, offset);
2942 } else {
2943 unsigned long seg_skip;
2944
2945 /* skip first vec */
2946 offset -= bvec->bv_len;
2947 seg_skip = 1 + (offset >> PAGE_SHIFT);
2948
2949 iter->bvec = bvec + seg_skip;
2950 iter->nr_segs -= seg_skip;
99c79f66 2951 iter->count -= bvec->bv_len + offset;
bd11b3a3 2952 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2953 }
2954 }
2955
847595de 2956 return 0;
edafccee
JA
2957}
2958
eae071c9
PB
2959static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2960{
2961 struct io_ring_ctx *ctx = req->ctx;
2962 struct io_mapped_ubuf *imu = req->imu;
2963 u16 index, buf_index = req->buf_index;
2964
2965 if (likely(!imu)) {
2966 if (unlikely(buf_index >= ctx->nr_user_bufs))
2967 return -EFAULT;
2968 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2969 imu = READ_ONCE(ctx->user_bufs[index]);
2970 req->imu = imu;
2971 }
2972 return __io_import_fixed(req, rw, iter, imu);
2973}
2974
bcda7baa
JA
2975static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2976{
2977 if (needs_lock)
2978 mutex_unlock(&ctx->uring_lock);
2979}
2980
2981static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2982{
2983 /*
2984 * "Normal" inline submissions always hold the uring_lock, since we
2985 * grab it from the system call. Same is true for the SQPOLL offload.
2986 * The only exception is when we've detached the request and issue it
2987 * from an async worker thread, grab the lock for that case.
2988 */
2989 if (needs_lock)
2990 mutex_lock(&ctx->uring_lock);
2991}
2992
2993static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
30d51dd4 2994 int bgid, bool needs_lock)
bcda7baa 2995{
30d51dd4 2996 struct io_buffer *kbuf = req->kbuf;
bcda7baa
JA
2997 struct io_buffer *head;
2998
2999 if (req->flags & REQ_F_BUFFER_SELECTED)
3000 return kbuf;
3001
3002 io_ring_submit_lock(req->ctx, needs_lock);
3003
3004 lockdep_assert_held(&req->ctx->uring_lock);
3005
9e15c3a0 3006 head = xa_load(&req->ctx->io_buffers, bgid);
bcda7baa
JA
3007 if (head) {
3008 if (!list_empty(&head->list)) {
3009 kbuf = list_last_entry(&head->list, struct io_buffer,
3010 list);
3011 list_del(&kbuf->list);
3012 } else {
3013 kbuf = head;
9e15c3a0 3014 xa_erase(&req->ctx->io_buffers, bgid);
bcda7baa
JA
3015 }
3016 if (*len > kbuf->len)
3017 *len = kbuf->len;
30d51dd4
PB
3018 req->flags |= REQ_F_BUFFER_SELECTED;
3019 req->kbuf = kbuf;
bcda7baa
JA
3020 } else {
3021 kbuf = ERR_PTR(-ENOBUFS);
3022 }
3023
3024 io_ring_submit_unlock(req->ctx, needs_lock);
bcda7baa
JA
3025 return kbuf;
3026}
3027
4d954c25
JA
3028static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3029 bool needs_lock)
3030{
3031 struct io_buffer *kbuf;
4f4eeba8 3032 u16 bgid;
4d954c25 3033
4f4eeba8 3034 bgid = req->buf_index;
30d51dd4 3035 kbuf = io_buffer_select(req, len, bgid, needs_lock);
4d954c25
JA
3036 if (IS_ERR(kbuf))
3037 return kbuf;
4d954c25
JA
3038 return u64_to_user_ptr(kbuf->addr);
3039}
3040
3041#ifdef CONFIG_COMPAT
3042static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3043 bool needs_lock)
3044{
3045 struct compat_iovec __user *uiov;
3046 compat_ssize_t clen;
3047 void __user *buf;
3048 ssize_t len;
3049
3050 uiov = u64_to_user_ptr(req->rw.addr);
3051 if (!access_ok(uiov, sizeof(*uiov)))
3052 return -EFAULT;
3053 if (__get_user(clen, &uiov->iov_len))
3054 return -EFAULT;
3055 if (clen < 0)
3056 return -EINVAL;
3057
3058 len = clen;
3059 buf = io_rw_buffer_select(req, &len, needs_lock);
3060 if (IS_ERR(buf))
3061 return PTR_ERR(buf);
3062 iov[0].iov_base = buf;
3063 iov[0].iov_len = (compat_size_t) len;
3064 return 0;
3065}
3066#endif
3067
3068static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3069 bool needs_lock)
3070{
3071 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3072 void __user *buf;
3073 ssize_t len;
3074
3075 if (copy_from_user(iov, uiov, sizeof(*uiov)))
3076 return -EFAULT;
3077
3078 len = iov[0].iov_len;
3079 if (len < 0)
3080 return -EINVAL;
3081 buf = io_rw_buffer_select(req, &len, needs_lock);
3082 if (IS_ERR(buf))
3083 return PTR_ERR(buf);
3084 iov[0].iov_base = buf;
3085 iov[0].iov_len = len;
3086 return 0;
3087}
3088
3089static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3090 bool needs_lock)
3091{
dddb3e26 3092 if (req->flags & REQ_F_BUFFER_SELECTED) {
30d51dd4 3093 struct io_buffer *kbuf = req->kbuf;
dddb3e26 3094
dddb3e26
JA
3095 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3096 iov[0].iov_len = kbuf->len;
4d954c25 3097 return 0;
dddb3e26 3098 }
dd201662 3099 if (req->rw.len != 1)
4d954c25
JA
3100 return -EINVAL;
3101
3102#ifdef CONFIG_COMPAT
3103 if (req->ctx->compat)
3104 return io_compat_import(req, iov, needs_lock);
3105#endif
3106
3107 return __io_iov_buffer_select(req, iov, needs_lock);
3108}
3109
847595de
PB
3110static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3111 struct iov_iter *iter, bool needs_lock)
2b188cc1 3112{
9adbd45d
JA
3113 void __user *buf = u64_to_user_ptr(req->rw.addr);
3114 size_t sqe_len = req->rw.len;
847595de 3115 u8 opcode = req->opcode;
4d954c25 3116 ssize_t ret;
edafccee 3117
7d009165 3118 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 3119 *iovec = NULL;
9adbd45d 3120 return io_import_fixed(req, rw, iter);
edafccee 3121 }
2b188cc1 3122
bcda7baa 3123 /* buffer index only valid with fixed read/write, or buffer select */
4f4eeba8 3124 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
9adbd45d
JA
3125 return -EINVAL;
3126
3a6820f2 3127 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 3128 if (req->flags & REQ_F_BUFFER_SELECT) {
4d954c25 3129 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
867a23ea 3130 if (IS_ERR(buf))
4d954c25 3131 return PTR_ERR(buf);
3f9d6441 3132 req->rw.len = sqe_len;
bcda7baa
JA
3133 }
3134
3a6820f2
JA
3135 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3136 *iovec = NULL;
10fc72e4 3137 return ret;
3a6820f2
JA
3138 }
3139
4d954c25
JA
3140 if (req->flags & REQ_F_BUFFER_SELECT) {
3141 ret = io_iov_buffer_select(req, *iovec, needs_lock);
847595de
PB
3142 if (!ret)
3143 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
4d954c25
JA
3144 *iovec = NULL;
3145 return ret;
3146 }
3147
89cd35c5
CH
3148 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3149 req->ctx->compat);
2b188cc1
JA
3150}
3151
0fef9483
JA
3152static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3153{
5b09e37e 3154 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3155}
3156
31b51510 3157/*
32960613
JA
3158 * For files that don't have ->read_iter() and ->write_iter(), handle them
3159 * by looping over ->read() or ->write() manually.
31b51510 3160 */
4017eb91 3161static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3162{
4017eb91
JA
3163 struct kiocb *kiocb = &req->rw.kiocb;
3164 struct file *file = req->file;
32960613
JA
3165 ssize_t ret = 0;
3166
3167 /*
3168 * Don't support polled IO through this interface, and we can't
3169 * support non-blocking either. For the latter, this just causes
3170 * the kiocb to be handled from an async context.
3171 */
3172 if (kiocb->ki_flags & IOCB_HIPRI)
3173 return -EOPNOTSUPP;
3174 if (kiocb->ki_flags & IOCB_NOWAIT)
3175 return -EAGAIN;
3176
3177 while (iov_iter_count(iter)) {
311ae9e1 3178 struct iovec iovec;
32960613
JA
3179 ssize_t nr;
3180
311ae9e1
PB
3181 if (!iov_iter_is_bvec(iter)) {
3182 iovec = iov_iter_iovec(iter);
3183 } else {
4017eb91
JA
3184 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3185 iovec.iov_len = req->rw.len;
311ae9e1
PB
3186 }
3187
32960613
JA
3188 if (rw == READ) {
3189 nr = file->f_op->read(file, iovec.iov_base,
0fef9483 3190 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3191 } else {
3192 nr = file->f_op->write(file, iovec.iov_base,
0fef9483 3193 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3194 }
3195
3196 if (nr < 0) {
3197 if (!ret)
3198 ret = nr;
3199 break;
3200 }
16c8d2df
JA
3201 if (!iov_iter_is_bvec(iter)) {
3202 iov_iter_advance(iter, nr);
3203 } else {
3204 req->rw.len -= nr;
3205 req->rw.addr += nr;
3206 }
32960613
JA
3207 ret += nr;
3208 if (nr != iovec.iov_len)
3209 break;
32960613
JA
3210 }
3211
3212 return ret;
3213}
3214
ff6165b2
JA
3215static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3216 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3217{
e8c2bc1f 3218 struct io_async_rw *rw = req->async_data;
b64e3444 3219
ff6165b2 3220 memcpy(&rw->iter, iter, sizeof(*iter));
afb87658 3221 rw->free_iovec = iovec;
227c0c96 3222 rw->bytes_done = 0;
ff6165b2 3223 /* can only be fixed buffers, no need to do anything */
9c3a205c 3224 if (iov_iter_is_bvec(iter))
ff6165b2 3225 return;
b64e3444 3226 if (!iovec) {
ff6165b2
JA
3227 unsigned iov_off = 0;
3228
3229 rw->iter.iov = rw->fast_iov;
3230 if (iter->iov != fast_iov) {
3231 iov_off = iter->iov - fast_iov;
3232 rw->iter.iov += iov_off;
3233 }
3234 if (rw->fast_iov != fast_iov)
3235 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
45097dae 3236 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3237 } else {
3238 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3239 }
3240}
3241
8d4af685 3242static inline bool io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3243{
e8c2bc1f
JA
3244 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3245 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
d886e185
PB
3246 if (req->async_data) {
3247 req->flags |= REQ_F_ASYNC_DATA;
3248 return false;
3249 }
3250 return true;
3d9932a8
XW
3251}
3252
ff6165b2
JA
3253static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3254 const struct iovec *fast_iov,
227c0c96 3255 struct iov_iter *iter, bool force)
b7bb4f7d 3256{
26f0505a 3257 if (!force && !io_op_defs[req->opcode].needs_async_setup)
74566df3 3258 return 0;
d886e185 3259 if (!req_has_async_data(req)) {
cd658695
JA
3260 struct io_async_rw *iorw;
3261
6cb78689 3262 if (io_alloc_async_data(req)) {
6bf985dc 3263 kfree(iovec);
5d204bcf 3264 return -ENOMEM;
6bf985dc 3265 }
b7bb4f7d 3266
ff6165b2 3267 io_req_map_rw(req, iovec, fast_iov, iter);
cd658695
JA
3268 iorw = req->async_data;
3269 /* we've copied and mapped the iter, ensure state is saved */
3270 iov_iter_save_state(&iorw->iter, &iorw->iter_state);
5d204bcf 3271 }
b7bb4f7d 3272 return 0;
f67676d1
JA
3273}
3274
73debe68 3275static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3276{
e8c2bc1f 3277 struct io_async_rw *iorw = req->async_data;
f4bff104 3278 struct iovec *iov = iorw->fast_iov;
847595de 3279 int ret;
c3e330a4 3280
2846c481 3281 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
c3e330a4
PB
3282 if (unlikely(ret < 0))
3283 return ret;
3284
ab0b196c
PB
3285 iorw->bytes_done = 0;
3286 iorw->free_iovec = iov;
3287 if (iov)
3288 req->flags |= REQ_F_NEED_CLEANUP;
cd658695 3289 iov_iter_save_state(&iorw->iter, &iorw->iter_state);
c3e330a4
PB
3290 return 0;
3291}
3292
73debe68 3293static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3294{
3529d8c2
JA
3295 if (unlikely(!(req->file->f_mode & FMODE_READ)))
3296 return -EBADF;
5d329e12 3297 return io_prep_rw(req, sqe, READ);
f67676d1
JA
3298}
3299
c1dd91d1
JA
3300/*
3301 * This is our waitqueue callback handler, registered through lock_page_async()
3302 * when we initially tried to do the IO with the iocb armed our waitqueue.
3303 * This gets called when the page is unlocked, and we generally expect that to
3304 * happen when the page IO is completed and the page is now uptodate. This will
3305 * queue a task_work based retry of the operation, attempting to copy the data
3306 * again. If the latter fails because the page was NOT uptodate, then we will
3307 * do a thread based blocking retry of the operation. That's the unexpected
3308 * slow path.
3309 */
bcf5a063
JA
3310static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3311 int sync, void *arg)
3312{
3313 struct wait_page_queue *wpq;
3314 struct io_kiocb *req = wait->private;
bcf5a063 3315 struct wait_page_key *key = arg;
bcf5a063
JA
3316
3317 wpq = container_of(wait, struct wait_page_queue, wait);
3318
cdc8fcb4
LT
3319 if (!wake_page_match(wpq, key))
3320 return 0;
3321
c8d317aa 3322 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063 3323 list_del_init(&wait->entry);
921b9054 3324 io_req_task_queue(req);
bcf5a063
JA
3325 return 1;
3326}
3327
c1dd91d1
JA
3328/*
3329 * This controls whether a given IO request should be armed for async page
3330 * based retry. If we return false here, the request is handed to the async
3331 * worker threads for retry. If we're doing buffered reads on a regular file,
3332 * we prepare a private wait_page_queue entry and retry the operation. This
3333 * will either succeed because the page is now uptodate and unlocked, or it
3334 * will register a callback when the page is unlocked at IO completion. Through
3335 * that callback, io_uring uses task_work to setup a retry of the operation.
3336 * That retry will attempt the buffered read again. The retry will generally
3337 * succeed, or in rare cases where it fails, we then fall back to using the
3338 * async worker threads for a blocking retry.
3339 */
227c0c96 3340static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3341{
e8c2bc1f
JA
3342 struct io_async_rw *rw = req->async_data;
3343 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3344 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3345
bcf5a063
JA
3346 /* never retry for NOWAIT, we just complete with -EAGAIN */
3347 if (req->flags & REQ_F_NOWAIT)
3348 return false;
f67676d1 3349
227c0c96 3350 /* Only for buffered IO */
3b2a4439 3351 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3352 return false;
3b2a4439 3353
bcf5a063
JA
3354 /*
3355 * just use poll if we can, and don't attempt if the fs doesn't
3356 * support callback based unlocks
3357 */
3358 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3359 return false;
f67676d1 3360
3b2a4439
JA
3361 wait->wait.func = io_async_buf_func;
3362 wait->wait.private = req;
3363 wait->wait.flags = 0;
3364 INIT_LIST_HEAD(&wait->wait.entry);
3365 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3366 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3367 kiocb->ki_waitq = wait;
3b2a4439 3368 return true;
bcf5a063
JA
3369}
3370
aeab9506 3371static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
bcf5a063
JA
3372{
3373 if (req->file->f_op->read_iter)
3374 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3375 else if (req->file->f_op->read)
4017eb91 3376 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3377 else
3378 return -EINVAL;
f67676d1
JA
3379}
3380
7db30437
ML
3381static bool need_read_all(struct io_kiocb *req)
3382{
3383 return req->flags & REQ_F_ISREG ||
3384 S_ISBLK(file_inode(req->file)->i_mode);
3385}
3386
889fca73 3387static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3388{
3389 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3390 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3391 struct iov_iter __iter, *iter = &__iter;
45d189c6 3392 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 3393 struct iov_iter_state __state, *state;
d886e185 3394 struct io_async_rw *rw;
cd658695 3395 ssize_t ret, ret2;
ff6165b2 3396
d886e185
PB
3397 if (req_has_async_data(req)) {
3398 rw = req->async_data;
e8c2bc1f 3399 iter = &rw->iter;
cd658695
JA
3400 state = &rw->iter_state;
3401 /*
3402 * We come here from an earlier attempt, restore our state to
3403 * match in case it doesn't. It's cheap enough that we don't
3404 * need to make this conditional.
3405 */
3406 iov_iter_restore(iter, state);
2846c481
PB
3407 iovec = NULL;
3408 } else {
3409 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3410 if (ret < 0)
3411 return ret;
cd658695
JA
3412 state = &__state;
3413 iov_iter_save_state(iter, state);
2846c481 3414 }
cd658695 3415 req->result = iov_iter_count(iter);
2b188cc1 3416
fd6c2e4c
JA
3417 /* Ensure we clear previously set non-block flag */
3418 if (!force_nonblock)
29de5f6a 3419 kiocb->ki_flags &= ~IOCB_NOWAIT;
a88fc400
PB
3420 else
3421 kiocb->ki_flags |= IOCB_NOWAIT;
3422
24c74678 3423 /* If the file doesn't support async, just async punt */
b191e2df 3424 if (force_nonblock && !io_file_supports_nowait(req, READ)) {
6713e7a6 3425 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc 3426 return ret ?: -EAGAIN;
6713e7a6 3427 }
9e645e11 3428
cd658695 3429 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
5ea5dd45
PB
3430 if (unlikely(ret)) {
3431 kfree(iovec);
3432 return ret;
3433 }
2b188cc1 3434
227c0c96 3435 ret = io_iter_do_read(req, iter);
32960613 3436
230d50d4 3437 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
6ad7f233 3438 req->flags &= ~REQ_F_REISSUE;
eefdf30f
JA
3439 /* IOPOLL retry should happen for io-wq threads */
3440 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3441 goto done;
75c668cd
PB
3442 /* no retry on NONBLOCK nor RWF_NOWAIT */
3443 if (req->flags & REQ_F_NOWAIT)
355afaeb 3444 goto done;
f38c7e3a 3445 ret = 0;
230d50d4
JA
3446 } else if (ret == -EIOCBQUEUED) {
3447 goto out_free;
cd658695 3448 } else if (ret <= 0 || ret == req->result || !force_nonblock ||
7db30437 3449 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
7335e3bf 3450 /* read all, failed, already did sync or don't want to retry */
00d23d51 3451 goto done;
227c0c96
JA
3452 }
3453
cd658695
JA
3454 /*
3455 * Don't depend on the iter state matching what was consumed, or being
3456 * untouched in case of error. Restore it and we'll advance it
3457 * manually if we need to.
3458 */
3459 iov_iter_restore(iter, state);
3460
227c0c96 3461 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc
PB
3462 if (ret2)
3463 return ret2;
3464
fe1cdd55 3465 iovec = NULL;
e8c2bc1f 3466 rw = req->async_data;
cd658695
JA
3467 /*
3468 * Now use our persistent iterator and state, if we aren't already.
3469 * We've restored and mapped the iter to match.
3470 */
3471 if (iter != &rw->iter) {
3472 iter = &rw->iter;
3473 state = &rw->iter_state;
3474 }
227c0c96 3475
b23df91b 3476 do {
cd658695
JA
3477 /*
3478 * We end up here because of a partial read, either from
3479 * above or inside this loop. Advance the iter by the bytes
3480 * that were consumed.
3481 */
3482 iov_iter_advance(iter, ret);
3483 if (!iov_iter_count(iter))
3484 break;
b23df91b 3485 rw->bytes_done += ret;
cd658695
JA
3486 iov_iter_save_state(iter, state);
3487
b23df91b
PB
3488 /* if we can retry, do so with the callbacks armed */
3489 if (!io_rw_should_retry(req)) {
3490 kiocb->ki_flags &= ~IOCB_WAITQ;
3491 return -EAGAIN;
3492 }
3493
3494 /*
3495 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3496 * we get -EIOCBQUEUED, then we'll get a notification when the
3497 * desired page gets unlocked. We can also get a partial read
3498 * here, and if we do, then just retry at the new offset.
3499 */
3500 ret = io_iter_do_read(req, iter);
3501 if (ret == -EIOCBQUEUED)
3502 return 0;
227c0c96 3503 /* we got some bytes, but not all. retry. */
b5b0ecb7 3504 kiocb->ki_flags &= ~IOCB_WAITQ;
cd658695
JA
3505 iov_iter_restore(iter, state);
3506 } while (ret > 0);
227c0c96 3507done:
889fca73 3508 kiocb_done(kiocb, ret, issue_flags);
fe1cdd55
PB
3509out_free:
3510 /* it's faster to check here then delegate to kfree */
3511 if (iovec)
3512 kfree(iovec);
5ea5dd45 3513 return 0;
2b188cc1
JA
3514}
3515
73debe68 3516static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3517{
3529d8c2
JA
3518 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3519 return -EBADF;
5d329e12 3520 return io_prep_rw(req, sqe, WRITE);
f67676d1
JA
3521}
3522
889fca73 3523static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3524{
3525 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3526 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3527 struct iov_iter __iter, *iter = &__iter;
45d189c6 3528 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 3529 struct iov_iter_state __state, *state;
d886e185 3530 struct io_async_rw *rw;
cd658695 3531 ssize_t ret, ret2;
2b188cc1 3532
d886e185
PB
3533 if (req_has_async_data(req)) {
3534 rw = req->async_data;
e8c2bc1f 3535 iter = &rw->iter;
cd658695
JA
3536 state = &rw->iter_state;
3537 iov_iter_restore(iter, state);
2846c481
PB
3538 iovec = NULL;
3539 } else {
3540 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3541 if (ret < 0)
3542 return ret;
cd658695
JA
3543 state = &__state;
3544 iov_iter_save_state(iter, state);
2846c481 3545 }
cd658695 3546 req->result = iov_iter_count(iter);
2b188cc1 3547
fd6c2e4c
JA
3548 /* Ensure we clear previously set non-block flag */
3549 if (!force_nonblock)
a88fc400
PB
3550 kiocb->ki_flags &= ~IOCB_NOWAIT;
3551 else
3552 kiocb->ki_flags |= IOCB_NOWAIT;
fd6c2e4c 3553
24c74678 3554 /* If the file doesn't support async, just async punt */
b191e2df 3555 if (force_nonblock && !io_file_supports_nowait(req, WRITE))
f67676d1 3556 goto copy_iov;
31b51510 3557
10d59345
JA
3558 /* file path doesn't support NOWAIT for non-direct_IO */
3559 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3560 (req->flags & REQ_F_ISREG))
f67676d1 3561 goto copy_iov;
31b51510 3562
cd658695 3563 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
fa15bafb
PB
3564 if (unlikely(ret))
3565 goto out_free;
4ed734b0 3566
fa15bafb
PB
3567 /*
3568 * Open-code file_start_write here to grab freeze protection,
3569 * which will be released by another thread in
3570 * io_complete_rw(). Fool lockdep by telling it the lock got
3571 * released so that it doesn't complain about the held lock when
3572 * we return to userspace.
3573 */
3574 if (req->flags & REQ_F_ISREG) {
8a3c84b6 3575 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
3576 __sb_writers_release(file_inode(req->file)->i_sb,
3577 SB_FREEZE_WRITE);
3578 }
3579 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 3580
fa15bafb 3581 if (req->file->f_op->write_iter)
ff6165b2 3582 ret2 = call_write_iter(req->file, kiocb, iter);
2dd2111d 3583 else if (req->file->f_op->write)
4017eb91 3584 ret2 = loop_rw_iter(WRITE, req, iter);
2dd2111d
GH
3585 else
3586 ret2 = -EINVAL;
4ed734b0 3587
6ad7f233
PB
3588 if (req->flags & REQ_F_REISSUE) {
3589 req->flags &= ~REQ_F_REISSUE;
230d50d4 3590 ret2 = -EAGAIN;
6ad7f233 3591 }
230d50d4 3592
fa15bafb
PB
3593 /*
3594 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3595 * retry them without IOCB_NOWAIT.
3596 */
3597 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3598 ret2 = -EAGAIN;
75c668cd
PB
3599 /* no retry on NONBLOCK nor RWF_NOWAIT */
3600 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 3601 goto done;
fa15bafb 3602 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f
JA
3603 /* IOPOLL retry should happen for io-wq threads */
3604 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3605 goto copy_iov;
355afaeb 3606done:
889fca73 3607 kiocb_done(kiocb, ret2, issue_flags);
fa15bafb 3608 } else {
f67676d1 3609copy_iov:
cd658695 3610 iov_iter_restore(iter, state);
227c0c96 3611 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
6bf985dc 3612 return ret ?: -EAGAIN;
2b188cc1 3613 }
31b51510 3614out_free:
f261c168 3615 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 3616 if (iovec)
6f2cc166 3617 kfree(iovec);
2b188cc1
JA
3618 return ret;
3619}
3620
80a261fd
JA
3621static int io_renameat_prep(struct io_kiocb *req,
3622 const struct io_uring_sqe *sqe)
3623{
3624 struct io_rename *ren = &req->rename;
3625 const char __user *oldf, *newf;
3626
ed7eb259
JA
3627 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3628 return -EINVAL;
26578cda 3629 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
ed7eb259 3630 return -EINVAL;
80a261fd
JA
3631 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3632 return -EBADF;
3633
3634 ren->old_dfd = READ_ONCE(sqe->fd);
3635 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3636 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3637 ren->new_dfd = READ_ONCE(sqe->len);
3638 ren->flags = READ_ONCE(sqe->rename_flags);
3639
3640 ren->oldpath = getname(oldf);
3641 if (IS_ERR(ren->oldpath))
3642 return PTR_ERR(ren->oldpath);
3643
3644 ren->newpath = getname(newf);
3645 if (IS_ERR(ren->newpath)) {
3646 putname(ren->oldpath);
3647 return PTR_ERR(ren->newpath);
3648 }
3649
3650 req->flags |= REQ_F_NEED_CLEANUP;
3651 return 0;
3652}
3653
45d189c6 3654static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
3655{
3656 struct io_rename *ren = &req->rename;
3657 int ret;
3658
45d189c6 3659 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
3660 return -EAGAIN;
3661
3662 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3663 ren->newpath, ren->flags);
3664
3665 req->flags &= ~REQ_F_NEED_CLEANUP;
3666 if (ret < 0)
93d2bcd2 3667 req_set_fail(req);
80a261fd
JA
3668 io_req_complete(req, ret);
3669 return 0;
3670}
3671
14a1143b
JA
3672static int io_unlinkat_prep(struct io_kiocb *req,
3673 const struct io_uring_sqe *sqe)
3674{
3675 struct io_unlink *un = &req->unlink;
3676 const char __user *fname;
3677
22634bc5
JA
3678 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3679 return -EINVAL;
26578cda
PB
3680 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3681 sqe->splice_fd_in)
22634bc5 3682 return -EINVAL;
14a1143b
JA
3683 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3684 return -EBADF;
3685
3686 un->dfd = READ_ONCE(sqe->fd);
3687
3688 un->flags = READ_ONCE(sqe->unlink_flags);
3689 if (un->flags & ~AT_REMOVEDIR)
3690 return -EINVAL;
3691
3692 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3693 un->filename = getname(fname);
3694 if (IS_ERR(un->filename))
3695 return PTR_ERR(un->filename);
3696
3697 req->flags |= REQ_F_NEED_CLEANUP;
3698 return 0;
3699}
3700
45d189c6 3701static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
3702{
3703 struct io_unlink *un = &req->unlink;
3704 int ret;
3705
45d189c6 3706 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
3707 return -EAGAIN;
3708
3709 if (un->flags & AT_REMOVEDIR)
3710 ret = do_rmdir(un->dfd, un->filename);
3711 else
3712 ret = do_unlinkat(un->dfd, un->filename);
3713
3714 req->flags &= ~REQ_F_NEED_CLEANUP;
3715 if (ret < 0)
93d2bcd2 3716 req_set_fail(req);
14a1143b
JA
3717 io_req_complete(req, ret);
3718 return 0;
3719}
3720
e34a02dc
DK
3721static int io_mkdirat_prep(struct io_kiocb *req,
3722 const struct io_uring_sqe *sqe)
3723{
3724 struct io_mkdir *mkd = &req->mkdir;
3725 const char __user *fname;
3726
3727 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3728 return -EINVAL;
3729 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
3730 sqe->splice_fd_in)
3731 return -EINVAL;
3732 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3733 return -EBADF;
3734
3735 mkd->dfd = READ_ONCE(sqe->fd);
3736 mkd->mode = READ_ONCE(sqe->len);
3737
3738 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3739 mkd->filename = getname(fname);
3740 if (IS_ERR(mkd->filename))
3741 return PTR_ERR(mkd->filename);
3742
3743 req->flags |= REQ_F_NEED_CLEANUP;
3744 return 0;
3745}
3746
3747static int io_mkdirat(struct io_kiocb *req, int issue_flags)
3748{
3749 struct io_mkdir *mkd = &req->mkdir;
3750 int ret;
3751
3752 if (issue_flags & IO_URING_F_NONBLOCK)
3753 return -EAGAIN;
3754
3755 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
3756
3757 req->flags &= ~REQ_F_NEED_CLEANUP;
3758 if (ret < 0)
3759 req_set_fail(req);
3760 io_req_complete(req, ret);
3761 return 0;
3762}
3763
7a8721f8
DK
3764static int io_symlinkat_prep(struct io_kiocb *req,
3765 const struct io_uring_sqe *sqe)
3766{
3767 struct io_symlink *sl = &req->symlink;
3768 const char __user *oldpath, *newpath;
3769
3770 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3771 return -EINVAL;
3772 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
3773 sqe->splice_fd_in)
3774 return -EINVAL;
3775 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3776 return -EBADF;
3777
3778 sl->new_dfd = READ_ONCE(sqe->fd);
3779 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
3780 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3781
3782 sl->oldpath = getname(oldpath);
3783 if (IS_ERR(sl->oldpath))
3784 return PTR_ERR(sl->oldpath);
3785
3786 sl->newpath = getname(newpath);
3787 if (IS_ERR(sl->newpath)) {
3788 putname(sl->oldpath);
3789 return PTR_ERR(sl->newpath);
3790 }
3791
3792 req->flags |= REQ_F_NEED_CLEANUP;
3793 return 0;
3794}
3795
3796static int io_symlinkat(struct io_kiocb *req, int issue_flags)
3797{
3798 struct io_symlink *sl = &req->symlink;
3799 int ret;
3800
3801 if (issue_flags & IO_URING_F_NONBLOCK)
3802 return -EAGAIN;
3803
3804 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
3805
3806 req->flags &= ~REQ_F_NEED_CLEANUP;
3807 if (ret < 0)
3808 req_set_fail(req);
3809 io_req_complete(req, ret);
3810 return 0;
3811}
3812
cf30da90
DK
3813static int io_linkat_prep(struct io_kiocb *req,
3814 const struct io_uring_sqe *sqe)
3815{
3816 struct io_hardlink *lnk = &req->hardlink;
3817 const char __user *oldf, *newf;
3818
3819 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3820 return -EINVAL;
3821 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
3822 return -EINVAL;
3823 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3824 return -EBADF;
3825
3826 lnk->old_dfd = READ_ONCE(sqe->fd);
3827 lnk->new_dfd = READ_ONCE(sqe->len);
3828 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3829 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3830 lnk->flags = READ_ONCE(sqe->hardlink_flags);
3831
3832 lnk->oldpath = getname(oldf);
3833 if (IS_ERR(lnk->oldpath))
3834 return PTR_ERR(lnk->oldpath);
3835
3836 lnk->newpath = getname(newf);
3837 if (IS_ERR(lnk->newpath)) {
3838 putname(lnk->oldpath);
3839 return PTR_ERR(lnk->newpath);
3840 }
3841
3842 req->flags |= REQ_F_NEED_CLEANUP;
3843 return 0;
3844}
3845
3846static int io_linkat(struct io_kiocb *req, int issue_flags)
3847{
3848 struct io_hardlink *lnk = &req->hardlink;
3849 int ret;
3850
3851 if (issue_flags & IO_URING_F_NONBLOCK)
3852 return -EAGAIN;
3853
3854 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
3855 lnk->newpath, lnk->flags);
3856
3857 req->flags &= ~REQ_F_NEED_CLEANUP;
3858 if (ret < 0)
3859 req_set_fail(req);
3860 io_req_complete(req, ret);
3861 return 0;
3862}
3863
36f4fa68
JA
3864static int io_shutdown_prep(struct io_kiocb *req,
3865 const struct io_uring_sqe *sqe)
3866{
3867#if defined(CONFIG_NET)
3868 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3869 return -EINVAL;
26578cda
PB
3870 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3871 sqe->buf_index || sqe->splice_fd_in))
36f4fa68
JA
3872 return -EINVAL;
3873
3874 req->shutdown.how = READ_ONCE(sqe->len);
3875 return 0;
3876#else
3877 return -EOPNOTSUPP;
3878#endif
3879}
3880
45d189c6 3881static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
3882{
3883#if defined(CONFIG_NET)
3884 struct socket *sock;
3885 int ret;
3886
45d189c6 3887 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
3888 return -EAGAIN;
3889
48aba79b 3890 sock = sock_from_file(req->file);
36f4fa68 3891 if (unlikely(!sock))
48aba79b 3892 return -ENOTSOCK;
36f4fa68
JA
3893
3894 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d 3895 if (ret < 0)
93d2bcd2 3896 req_set_fail(req);
36f4fa68
JA
3897 io_req_complete(req, ret);
3898 return 0;
3899#else
3900 return -EOPNOTSUPP;
3901#endif
3902}
3903
f2a8d5c7
PB
3904static int __io_splice_prep(struct io_kiocb *req,
3905 const struct io_uring_sqe *sqe)
7d67af2c 3906{
fe7e3257 3907 struct io_splice *sp = &req->splice;
7d67af2c 3908 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 3909
3232dd02
PB
3910 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3911 return -EINVAL;
7d67af2c
PB
3912
3913 sp->file_in = NULL;
7d67af2c
PB
3914 sp->len = READ_ONCE(sqe->len);
3915 sp->flags = READ_ONCE(sqe->splice_flags);
3916
3917 if (unlikely(sp->flags & ~valid_flags))
3918 return -EINVAL;
3919
62906e89 3920 sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
8371adf5
PB
3921 (sp->flags & SPLICE_F_FD_IN_FIXED));
3922 if (!sp->file_in)
3923 return -EBADF;
7d67af2c 3924 req->flags |= REQ_F_NEED_CLEANUP;
7d67af2c
PB
3925 return 0;
3926}
3927
f2a8d5c7
PB
3928static int io_tee_prep(struct io_kiocb *req,
3929 const struct io_uring_sqe *sqe)
3930{
3931 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3932 return -EINVAL;
3933 return __io_splice_prep(req, sqe);
3934}
3935
45d189c6 3936static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
3937{
3938 struct io_splice *sp = &req->splice;
3939 struct file *in = sp->file_in;
3940 struct file *out = sp->file_out;
3941 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3942 long ret = 0;
3943
45d189c6 3944 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7
PB
3945 return -EAGAIN;
3946 if (sp->len)
3947 ret = do_tee(in, out, sp->len, flags);
3948
e1d767f0
PB
3949 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3950 io_put_file(in);
f2a8d5c7
PB
3951 req->flags &= ~REQ_F_NEED_CLEANUP;
3952
f2a8d5c7 3953 if (ret != sp->len)
93d2bcd2 3954 req_set_fail(req);
e1e16097 3955 io_req_complete(req, ret);
f2a8d5c7
PB
3956 return 0;
3957}
3958
3959static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3960{
fe7e3257 3961 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
3962
3963 sp->off_in = READ_ONCE(sqe->splice_off_in);
3964 sp->off_out = READ_ONCE(sqe->off);
3965 return __io_splice_prep(req, sqe);
3966}
3967
45d189c6 3968static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
3969{
3970 struct io_splice *sp = &req->splice;
3971 struct file *in = sp->file_in;
3972 struct file *out = sp->file_out;
3973 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3974 loff_t *poff_in, *poff_out;
c9687426 3975 long ret = 0;
7d67af2c 3976
45d189c6 3977 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 3978 return -EAGAIN;
7d67af2c
PB
3979
3980 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3981 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 3982
948a7749 3983 if (sp->len)
c9687426 3984 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c 3985
e1d767f0
PB
3986 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3987 io_put_file(in);
7d67af2c
PB
3988 req->flags &= ~REQ_F_NEED_CLEANUP;
3989
7d67af2c 3990 if (ret != sp->len)
93d2bcd2 3991 req_set_fail(req);
e1e16097 3992 io_req_complete(req, ret);
7d67af2c
PB
3993 return 0;
3994}
3995
2b188cc1
JA
3996/*
3997 * IORING_OP_NOP just posts a completion event, nothing else.
3998 */
889fca73 3999static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
4000{
4001 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 4002
def596e9
JA
4003 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4004 return -EINVAL;
4005
889fca73 4006 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
4007 return 0;
4008}
4009
1155c76a 4010static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 4011{
6b06314c 4012 struct io_ring_ctx *ctx = req->ctx;
c992fe29 4013
09bb8394
JA
4014 if (!req->file)
4015 return -EBADF;
c992fe29 4016
6b06314c 4017 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 4018 return -EINVAL;
26578cda
PB
4019 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4020 sqe->splice_fd_in))
c992fe29
CH
4021 return -EINVAL;
4022
8ed8d3c3
JA
4023 req->sync.flags = READ_ONCE(sqe->fsync_flags);
4024 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4025 return -EINVAL;
4026
4027 req->sync.off = READ_ONCE(sqe->off);
4028 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
4029 return 0;
4030}
4031
45d189c6 4032static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4033{
8ed8d3c3 4034 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
4035 int ret;
4036
ac45abc0 4037 /* fsync always requires a blocking context */
45d189c6 4038 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4039 return -EAGAIN;
4040
9adbd45d 4041 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
4042 end > 0 ? end : LLONG_MAX,
4043 req->sync.flags & IORING_FSYNC_DATASYNC);
4044 if (ret < 0)
93d2bcd2 4045 req_set_fail(req);
e1e16097 4046 io_req_complete(req, ret);
c992fe29
CH
4047 return 0;
4048}
4049
d63d1b5e
JA
4050static int io_fallocate_prep(struct io_kiocb *req,
4051 const struct io_uring_sqe *sqe)
4052{
26578cda
PB
4053 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4054 sqe->splice_fd_in)
d63d1b5e 4055 return -EINVAL;
3232dd02
PB
4056 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4057 return -EINVAL;
d63d1b5e
JA
4058
4059 req->sync.off = READ_ONCE(sqe->off);
4060 req->sync.len = READ_ONCE(sqe->addr);
4061 req->sync.mode = READ_ONCE(sqe->len);
4062 return 0;
4063}
4064
45d189c6 4065static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 4066{
ac45abc0
PB
4067 int ret;
4068
d63d1b5e 4069 /* fallocate always requiring blocking context */
45d189c6 4070 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 4071 return -EAGAIN;
ac45abc0
PB
4072 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4073 req->sync.len);
ac45abc0 4074 if (ret < 0)
93d2bcd2 4075 req_set_fail(req);
e1e16097 4076 io_req_complete(req, ret);
5d17b4a4
JA
4077 return 0;
4078}
4079
ec65fea5 4080static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 4081{
f8748881 4082 const char __user *fname;
15b71abe 4083 int ret;
b7bb4f7d 4084
d3fddf6d
PB
4085 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4086 return -EINVAL;
b9445598 4087 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 4088 return -EINVAL;
ec65fea5 4089 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 4090 return -EBADF;
03b1230c 4091
ec65fea5
PB
4092 /* open.how should be already initialised */
4093 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 4094 req->open.how.flags |= O_LARGEFILE;
3529d8c2 4095
25e72d10
PB
4096 req->open.dfd = READ_ONCE(sqe->fd);
4097 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 4098 req->open.filename = getname(fname);
15b71abe
JA
4099 if (IS_ERR(req->open.filename)) {
4100 ret = PTR_ERR(req->open.filename);
4101 req->open.filename = NULL;
4102 return ret;
4103 }
b9445598
PB
4104
4105 req->open.file_slot = READ_ONCE(sqe->file_index);
4106 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4107 return -EINVAL;
4108
4022e7af 4109 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 4110 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 4111 return 0;
03b1230c
JA
4112}
4113
ec65fea5
PB
4114static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4115{
d3fddf6d
PB
4116 u64 mode = READ_ONCE(sqe->len);
4117 u64 flags = READ_ONCE(sqe->open_flags);
ec65fea5 4118
ec65fea5
PB
4119 req->open.how = build_open_how(flags, mode);
4120 return __io_openat_prep(req, sqe);
4121}
4122
cebdb986 4123static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 4124{
cebdb986 4125 struct open_how __user *how;
cebdb986 4126 size_t len;
0fa03c62
JA
4127 int ret;
4128
cebdb986
JA
4129 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4130 len = READ_ONCE(sqe->len);
cebdb986
JA
4131 if (len < OPEN_HOW_SIZE_VER0)
4132 return -EINVAL;
3529d8c2 4133
cebdb986
JA
4134 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4135 len);
4136 if (ret)
4137 return ret;
3529d8c2 4138
ec65fea5 4139 return __io_openat_prep(req, sqe);
cebdb986
JA
4140}
4141
45d189c6 4142static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
4143{
4144 struct open_flags op;
15b71abe 4145 struct file *file;
b9445598
PB
4146 bool resolve_nonblock, nonblock_set;
4147 bool fixed = !!req->open.file_slot;
15b71abe
JA
4148 int ret;
4149
cebdb986 4150 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
4151 if (ret)
4152 goto err;
3a81fd02
JA
4153 nonblock_set = op.open_flag & O_NONBLOCK;
4154 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 4155 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
4156 /*
4157 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4158 * it'll always -EAGAIN
4159 */
4160 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4161 return -EAGAIN;
4162 op.lookup_flags |= LOOKUP_CACHED;
4163 op.open_flag |= O_NONBLOCK;
4164 }
15b71abe 4165
b9445598
PB
4166 if (!fixed) {
4167 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4168 if (ret < 0)
4169 goto err;
4170 }
15b71abe
JA
4171
4172 file = do_filp_open(req->open.dfd, req->open.filename, &op);
12dcb58a 4173 if (IS_ERR(file)) {
944d1444 4174 /*
12dcb58a
PB
4175 * We could hang on to this 'fd' on retrying, but seems like
4176 * marginal gain for something that is now known to be a slower
4177 * path. So just put it, and we'll get a new one when we retry.
944d1444 4178 */
b9445598
PB
4179 if (!fixed)
4180 put_unused_fd(ret);
3a81fd02 4181
15b71abe 4182 ret = PTR_ERR(file);
12dcb58a
PB
4183 /* only retry if RESOLVE_CACHED wasn't already set by application */
4184 if (ret == -EAGAIN &&
4185 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4186 return -EAGAIN;
4187 goto err;
15b71abe 4188 }
12dcb58a
PB
4189
4190 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4191 file->f_flags &= ~O_NONBLOCK;
4192 fsnotify_open(file);
b9445598
PB
4193
4194 if (!fixed)
4195 fd_install(ret, file);
4196 else
4197 ret = io_install_fixed_file(req, file, issue_flags,
4198 req->open.file_slot - 1);
15b71abe
JA
4199err:
4200 putname(req->open.filename);
8fef80bf 4201 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe 4202 if (ret < 0)
93d2bcd2 4203 req_set_fail(req);
0bdf3398 4204 __io_req_complete(req, issue_flags, ret, 0);
15b71abe
JA
4205 return 0;
4206}
4207
45d189c6 4208static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 4209{
e45cff58 4210 return io_openat2(req, issue_flags);
cebdb986
JA
4211}
4212
067524e9
JA
4213static int io_remove_buffers_prep(struct io_kiocb *req,
4214 const struct io_uring_sqe *sqe)
4215{
4216 struct io_provide_buf *p = &req->pbuf;
4217 u64 tmp;
4218
26578cda
PB
4219 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4220 sqe->splice_fd_in)
067524e9
JA
4221 return -EINVAL;
4222
4223 tmp = READ_ONCE(sqe->fd);
4224 if (!tmp || tmp > USHRT_MAX)
4225 return -EINVAL;
4226
4227 memset(p, 0, sizeof(*p));
4228 p->nbufs = tmp;
4229 p->bgid = READ_ONCE(sqe->buf_group);
4230 return 0;
4231}
4232
4233static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4234 int bgid, unsigned nbufs)
4235{
4236 unsigned i = 0;
4237
4238 /* shouldn't happen */
4239 if (!nbufs)
4240 return 0;
4241
4242 /* the head kbuf is the list itself */
4243 while (!list_empty(&buf->list)) {
4244 struct io_buffer *nxt;
4245
4246 nxt = list_first_entry(&buf->list, struct io_buffer, list);
4247 list_del(&nxt->list);
4248 kfree(nxt);
4249 if (++i == nbufs)
4250 return i;
4251 }
4252 i++;
4253 kfree(buf);
9e15c3a0 4254 xa_erase(&ctx->io_buffers, bgid);
067524e9
JA
4255
4256 return i;
4257}
4258
889fca73 4259static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
4260{
4261 struct io_provide_buf *p = &req->pbuf;
4262 struct io_ring_ctx *ctx = req->ctx;
4263 struct io_buffer *head;
4264 int ret = 0;
45d189c6 4265 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
067524e9
JA
4266
4267 io_ring_submit_lock(ctx, !force_nonblock);
4268
4269 lockdep_assert_held(&ctx->uring_lock);
4270
4271 ret = -ENOENT;
9e15c3a0 4272 head = xa_load(&ctx->io_buffers, p->bgid);
067524e9
JA
4273 if (head)
4274 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
067524e9 4275 if (ret < 0)
93d2bcd2 4276 req_set_fail(req);
067524e9 4277
9fb8cb49
PB
4278 /* complete before unlock, IOPOLL may need the lock */
4279 __io_req_complete(req, issue_flags, ret, 0);
4280 io_ring_submit_unlock(ctx, !force_nonblock);
067524e9
JA
4281 return 0;
4282}
4283
ddf0322d
JA
4284static int io_provide_buffers_prep(struct io_kiocb *req,
4285 const struct io_uring_sqe *sqe)
4286{
38134ada 4287 unsigned long size, tmp_check;
ddf0322d
JA
4288 struct io_provide_buf *p = &req->pbuf;
4289 u64 tmp;
4290
26578cda 4291 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
ddf0322d
JA
4292 return -EINVAL;
4293
4294 tmp = READ_ONCE(sqe->fd);
4295 if (!tmp || tmp > USHRT_MAX)
4296 return -E2BIG;
4297 p->nbufs = tmp;
4298 p->addr = READ_ONCE(sqe->addr);
4299 p->len = READ_ONCE(sqe->len);
4300
38134ada
PB
4301 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4302 &size))
4303 return -EOVERFLOW;
4304 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4305 return -EOVERFLOW;
4306
d81269fe
PB
4307 size = (unsigned long)p->len * p->nbufs;
4308 if (!access_ok(u64_to_user_ptr(p->addr), size))
ddf0322d
JA
4309 return -EFAULT;
4310
4311 p->bgid = READ_ONCE(sqe->buf_group);
4312 tmp = READ_ONCE(sqe->off);
4313 if (tmp > USHRT_MAX)
4314 return -E2BIG;
4315 p->bid = tmp;
4316 return 0;
4317}
4318
4319static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4320{
4321 struct io_buffer *buf;
4322 u64 addr = pbuf->addr;
4323 int i, bid = pbuf->bid;
4324
4325 for (i = 0; i < pbuf->nbufs; i++) {
9990da93 4326 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
ddf0322d
JA
4327 if (!buf)
4328 break;
4329
4330 buf->addr = addr;
d1f82808 4331 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
ddf0322d
JA
4332 buf->bid = bid;
4333 addr += pbuf->len;
4334 bid++;
4335 if (!*head) {
4336 INIT_LIST_HEAD(&buf->list);
4337 *head = buf;
4338 } else {
4339 list_add_tail(&buf->list, &(*head)->list);
4340 }
4341 }
4342
4343 return i ? i : -ENOMEM;
4344}
4345
889fca73 4346static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
4347{
4348 struct io_provide_buf *p = &req->pbuf;
4349 struct io_ring_ctx *ctx = req->ctx;
4350 struct io_buffer *head, *list;
4351 int ret = 0;
45d189c6 4352 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ddf0322d
JA
4353
4354 io_ring_submit_lock(ctx, !force_nonblock);
4355
4356 lockdep_assert_held(&ctx->uring_lock);
4357
9e15c3a0 4358 list = head = xa_load(&ctx->io_buffers, p->bgid);
ddf0322d
JA
4359
4360 ret = io_add_buffers(p, &head);
9e15c3a0
JA
4361 if (ret >= 0 && !list) {
4362 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4363 if (ret < 0)
067524e9 4364 __io_remove_buffers(ctx, head, p->bgid, -1U);
ddf0322d 4365 }
ddf0322d 4366 if (ret < 0)
93d2bcd2 4367 req_set_fail(req);
9fb8cb49
PB
4368 /* complete before unlock, IOPOLL may need the lock */
4369 __io_req_complete(req, issue_flags, ret, 0);
4370 io_ring_submit_unlock(ctx, !force_nonblock);
ddf0322d 4371 return 0;
cebdb986
JA
4372}
4373
3e4827b0
JA
4374static int io_epoll_ctl_prep(struct io_kiocb *req,
4375 const struct io_uring_sqe *sqe)
4376{
4377#if defined(CONFIG_EPOLL)
26578cda 4378 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3e4827b0 4379 return -EINVAL;
2d74d042 4380 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4381 return -EINVAL;
3e4827b0
JA
4382
4383 req->epoll.epfd = READ_ONCE(sqe->fd);
4384 req->epoll.op = READ_ONCE(sqe->len);
4385 req->epoll.fd = READ_ONCE(sqe->off);
4386
4387 if (ep_op_has_event(req->epoll.op)) {
4388 struct epoll_event __user *ev;
4389
4390 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4391 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4392 return -EFAULT;
4393 }
4394
4395 return 0;
4396#else
4397 return -EOPNOTSUPP;
4398#endif
4399}
4400
889fca73 4401static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4402{
4403#if defined(CONFIG_EPOLL)
4404 struct io_epoll *ie = &req->epoll;
4405 int ret;
45d189c6 4406 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4407
4408 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4409 if (force_nonblock && ret == -EAGAIN)
4410 return -EAGAIN;
4411
4412 if (ret < 0)
93d2bcd2 4413 req_set_fail(req);
889fca73 4414 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4415 return 0;
4416#else
4417 return -EOPNOTSUPP;
4418#endif
4419}
4420
c1ca757b
JA
4421static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4422{
4423#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
26578cda 4424 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
c1ca757b 4425 return -EINVAL;
3232dd02
PB
4426 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4427 return -EINVAL;
c1ca757b
JA
4428
4429 req->madvise.addr = READ_ONCE(sqe->addr);
4430 req->madvise.len = READ_ONCE(sqe->len);
4431 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4432 return 0;
4433#else
4434 return -EOPNOTSUPP;
4435#endif
4436}
4437
45d189c6 4438static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4439{
4440#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4441 struct io_madvise *ma = &req->madvise;
4442 int ret;
4443
45d189c6 4444 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4445 return -EAGAIN;
4446
0726b01e 4447 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b 4448 if (ret < 0)
93d2bcd2 4449 req_set_fail(req);
e1e16097 4450 io_req_complete(req, ret);
c1ca757b
JA
4451 return 0;
4452#else
4453 return -EOPNOTSUPP;
4454#endif
4455}
4456
4840e418
JA
4457static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4458{
26578cda 4459 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4840e418 4460 return -EINVAL;
3232dd02
PB
4461 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4462 return -EINVAL;
4840e418
JA
4463
4464 req->fadvise.offset = READ_ONCE(sqe->off);
4465 req->fadvise.len = READ_ONCE(sqe->len);
4466 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4467 return 0;
4468}
4469
45d189c6 4470static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
4471{
4472 struct io_fadvise *fa = &req->fadvise;
4473 int ret;
4474
45d189c6 4475 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
4476 switch (fa->advice) {
4477 case POSIX_FADV_NORMAL:
4478 case POSIX_FADV_RANDOM:
4479 case POSIX_FADV_SEQUENTIAL:
4480 break;
4481 default:
4482 return -EAGAIN;
4483 }
4484 }
4840e418
JA
4485
4486 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4487 if (ret < 0)
93d2bcd2 4488 req_set_fail(req);
0bdf3398 4489 __io_req_complete(req, issue_flags, ret, 0);
4840e418
JA
4490 return 0;
4491}
4492
eddc7ef5
JA
4493static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4494{
2d74d042 4495 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4496 return -EINVAL;
26578cda 4497 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
eddc7ef5 4498 return -EINVAL;
9c280f90 4499 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4500 return -EBADF;
eddc7ef5 4501
1d9e1288
BM
4502 req->statx.dfd = READ_ONCE(sqe->fd);
4503 req->statx.mask = READ_ONCE(sqe->len);
e62753e4 4504 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
4505 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4506 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5
JA
4507
4508 return 0;
4509}
4510
45d189c6 4511static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 4512{
1d9e1288 4513 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
4514 int ret;
4515
59d70013 4516 if (issue_flags & IO_URING_F_NONBLOCK)
eddc7ef5
JA
4517 return -EAGAIN;
4518
e62753e4
BM
4519 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4520 ctx->buffer);
eddc7ef5 4521
eddc7ef5 4522 if (ret < 0)
93d2bcd2 4523 req_set_fail(req);
e1e16097 4524 io_req_complete(req, ret);
eddc7ef5
JA
4525 return 0;
4526}
4527
b5dba59e
JA
4528static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4529{
14587a46 4530 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4531 return -EINVAL;
b5dba59e 4532 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
7df778be 4533 sqe->rw_flags || sqe->buf_index)
b5dba59e 4534 return -EINVAL;
9c280f90 4535 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4536 return -EBADF;
b5dba59e
JA
4537
4538 req->close.fd = READ_ONCE(sqe->fd);
7df778be
PB
4539 req->close.file_slot = READ_ONCE(sqe->file_index);
4540 if (req->close.file_slot && req->close.fd)
4541 return -EINVAL;
4542
b5dba59e 4543 return 0;
b5dba59e
JA
4544}
4545
889fca73 4546static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 4547{
9eac1904 4548 struct files_struct *files = current->files;
3af73b28 4549 struct io_close *close = &req->close;
9eac1904 4550 struct fdtable *fdt;
a1fde923
PB
4551 struct file *file = NULL;
4552 int ret = -EBADF;
b5dba59e 4553
7df778be
PB
4554 if (req->close.file_slot) {
4555 ret = io_close_fixed(req, issue_flags);
4556 goto err;
4557 }
4558
9eac1904
JA
4559 spin_lock(&files->file_lock);
4560 fdt = files_fdtable(files);
4561 if (close->fd >= fdt->max_fds) {
4562 spin_unlock(&files->file_lock);
4563 goto err;
4564 }
4565 file = fdt->fd[close->fd];
a1fde923 4566 if (!file || file->f_op == &io_uring_fops) {
9eac1904
JA
4567 spin_unlock(&files->file_lock);
4568 file = NULL;
4569 goto err;
3af73b28 4570 }
b5dba59e
JA
4571
4572 /* if the file has a flush method, be safe and punt to async */
45d189c6 4573 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 4574 spin_unlock(&files->file_lock);
0bf0eefd 4575 return -EAGAIN;
a2100672 4576 }
b5dba59e 4577
9eac1904
JA
4578 ret = __close_fd_get_file(close->fd, &file);
4579 spin_unlock(&files->file_lock);
4580 if (ret < 0) {
4581 if (ret == -ENOENT)
4582 ret = -EBADF;
4583 goto err;
4584 }
4585
3af73b28 4586 /* No ->flush() or already async, safely close from here */
9eac1904
JA
4587 ret = filp_close(file, current->files);
4588err:
3af73b28 4589 if (ret < 0)
93d2bcd2 4590 req_set_fail(req);
9eac1904
JA
4591 if (file)
4592 fput(file);
889fca73 4593 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 4594 return 0;
b5dba59e
JA
4595}
4596
1155c76a 4597static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
4598{
4599 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 4600
5d17b4a4
JA
4601 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4602 return -EINVAL;
26578cda
PB
4603 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4604 sqe->splice_fd_in))
5d17b4a4
JA
4605 return -EINVAL;
4606
8ed8d3c3
JA
4607 req->sync.off = READ_ONCE(sqe->off);
4608 req->sync.len = READ_ONCE(sqe->len);
4609 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
4610 return 0;
4611}
4612
45d189c6 4613static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4614{
8ed8d3c3
JA
4615 int ret;
4616
ac45abc0 4617 /* sync_file_range always requires a blocking context */
45d189c6 4618 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4619 return -EAGAIN;
4620
9adbd45d 4621 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
4622 req->sync.flags);
4623 if (ret < 0)
93d2bcd2 4624 req_set_fail(req);
e1e16097 4625 io_req_complete(req, ret);
5d17b4a4
JA
4626 return 0;
4627}
4628
469956e8 4629#if defined(CONFIG_NET)
02d27d89
PB
4630static int io_setup_async_msg(struct io_kiocb *req,
4631 struct io_async_msghdr *kmsg)
4632{
e8c2bc1f
JA
4633 struct io_async_msghdr *async_msg = req->async_data;
4634
4635 if (async_msg)
02d27d89 4636 return -EAGAIN;
e8c2bc1f 4637 if (io_alloc_async_data(req)) {
257e84a5 4638 kfree(kmsg->free_iov);
02d27d89
PB
4639 return -ENOMEM;
4640 }
e8c2bc1f 4641 async_msg = req->async_data;
02d27d89 4642 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 4643 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 4644 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
4645 /* if were using fast_iov, set it to the new one */
4646 if (!async_msg->free_iov)
4647 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4648
02d27d89
PB
4649 return -EAGAIN;
4650}
4651
2ae523ed
PB
4652static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4653 struct io_async_msghdr *iomsg)
4654{
2ae523ed 4655 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 4656 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 4657 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 4658 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
4659}
4660
93642ef8
PB
4661static int io_sendmsg_prep_async(struct io_kiocb *req)
4662{
4663 int ret;
4664
93642ef8
PB
4665 ret = io_sendmsg_copy_hdr(req, req->async_data);
4666 if (!ret)
4667 req->flags |= REQ_F_NEED_CLEANUP;
4668 return ret;
4669}
4670
3529d8c2 4671static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 4672{
e47293fd 4673 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 4674
d2b6f48b
PB
4675 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4676 return -EINVAL;
4677
270a5940 4678 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 4679 sr->len = READ_ONCE(sqe->len);
04411806
PB
4680 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4681 if (sr->msg_flags & MSG_DONTWAIT)
4682 req->flags |= REQ_F_NOWAIT;
3529d8c2 4683
d8768362
JA
4684#ifdef CONFIG_COMPAT
4685 if (req->ctx->compat)
4686 sr->msg_flags |= MSG_CMSG_COMPAT;
4687#endif
93642ef8 4688 return 0;
03b1230c
JA
4689}
4690
889fca73 4691static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4692{
6b754c8b 4693 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 4694 struct socket *sock;
7a7cacba 4695 unsigned flags;
0031275d 4696 int min_ret = 0;
0fa03c62
JA
4697 int ret;
4698
dba4a925 4699 sock = sock_from_file(req->file);
7a7cacba 4700 if (unlikely(!sock))
dba4a925 4701 return -ENOTSOCK;
3529d8c2 4702
d886e185
PB
4703 if (req_has_async_data(req)) {
4704 kmsg = req->async_data;
4705 } else {
7a7cacba
PB
4706 ret = io_sendmsg_copy_hdr(req, &iomsg);
4707 if (ret)
4708 return ret;
4709 kmsg = &iomsg;
0fa03c62 4710 }
0fa03c62 4711
04411806
PB
4712 flags = req->sr_msg.msg_flags;
4713 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4714 flags |= MSG_DONTWAIT;
0031275d
SM
4715 if (flags & MSG_WAITALL)
4716 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4717
7a7cacba 4718 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
45d189c6 4719 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4720 return io_setup_async_msg(req, kmsg);
4721 if (ret == -ERESTARTSYS)
4722 ret = -EINTR;
0fa03c62 4723
257e84a5
PB
4724 /* fast path, check for non-NULL to avoid function call */
4725 if (kmsg->free_iov)
4726 kfree(kmsg->free_iov);
99bc4c38 4727 req->flags &= ~REQ_F_NEED_CLEANUP;
0031275d 4728 if (ret < min_ret)
93d2bcd2 4729 req_set_fail(req);
889fca73 4730 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 4731 return 0;
03b1230c 4732}
aa1fa28f 4733
889fca73 4734static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4735{
7a7cacba
PB
4736 struct io_sr_msg *sr = &req->sr_msg;
4737 struct msghdr msg;
4738 struct iovec iov;
fddaface 4739 struct socket *sock;
7a7cacba 4740 unsigned flags;
0031275d 4741 int min_ret = 0;
fddaface
JA
4742 int ret;
4743
dba4a925 4744 sock = sock_from_file(req->file);
7a7cacba 4745 if (unlikely(!sock))
dba4a925 4746 return -ENOTSOCK;
fddaface 4747
7a7cacba
PB
4748 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4749 if (unlikely(ret))
14db8411 4750 return ret;
fddaface 4751
7a7cacba
PB
4752 msg.msg_name = NULL;
4753 msg.msg_control = NULL;
4754 msg.msg_controllen = 0;
4755 msg.msg_namelen = 0;
fddaface 4756
04411806
PB
4757 flags = req->sr_msg.msg_flags;
4758 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4759 flags |= MSG_DONTWAIT;
0031275d
SM
4760 if (flags & MSG_WAITALL)
4761 min_ret = iov_iter_count(&msg.msg_iter);
4762
7a7cacba
PB
4763 msg.msg_flags = flags;
4764 ret = sock_sendmsg(sock, &msg);
45d189c6 4765 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4766 return -EAGAIN;
4767 if (ret == -ERESTARTSYS)
4768 ret = -EINTR;
fddaface 4769
0031275d 4770 if (ret < min_ret)
93d2bcd2 4771 req_set_fail(req);
889fca73 4772 __io_req_complete(req, issue_flags, ret, 0);
fddaface 4773 return 0;
fddaface
JA
4774}
4775
1400e697
PB
4776static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4777 struct io_async_msghdr *iomsg)
52de1fe1
JA
4778{
4779 struct io_sr_msg *sr = &req->sr_msg;
4780 struct iovec __user *uiov;
4781 size_t iov_len;
4782 int ret;
4783
1400e697
PB
4784 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4785 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
4786 if (ret)
4787 return ret;
4788
4789 if (req->flags & REQ_F_BUFFER_SELECT) {
4790 if (iov_len > 1)
4791 return -EINVAL;
5476dfed 4792 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 4793 return -EFAULT;
5476dfed 4794 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 4795 iomsg->free_iov = NULL;
52de1fe1 4796 } else {
257e84a5 4797 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4798 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 4799 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 4800 false);
52de1fe1
JA
4801 if (ret > 0)
4802 ret = 0;
4803 }
4804
4805 return ret;
4806}
4807
4808#ifdef CONFIG_COMPAT
4809static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 4810 struct io_async_msghdr *iomsg)
52de1fe1 4811{
52de1fe1
JA
4812 struct io_sr_msg *sr = &req->sr_msg;
4813 struct compat_iovec __user *uiov;
4814 compat_uptr_t ptr;
4815 compat_size_t len;
4816 int ret;
4817
4af3417a
PB
4818 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4819 &ptr, &len);
52de1fe1
JA
4820 if (ret)
4821 return ret;
4822
4823 uiov = compat_ptr(ptr);
4824 if (req->flags & REQ_F_BUFFER_SELECT) {
4825 compat_ssize_t clen;
4826
4827 if (len > 1)
4828 return -EINVAL;
4829 if (!access_ok(uiov, sizeof(*uiov)))
4830 return -EFAULT;
4831 if (__get_user(clen, &uiov->iov_len))
4832 return -EFAULT;
4833 if (clen < 0)
4834 return -EINVAL;
2d280bc8 4835 sr->len = clen;
257e84a5 4836 iomsg->free_iov = NULL;
52de1fe1 4837 } else {
257e84a5 4838 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4839 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 4840 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 4841 &iomsg->msg.msg_iter, true);
52de1fe1
JA
4842 if (ret < 0)
4843 return ret;
4844 }
4845
4846 return 0;
4847}
4848#endif
4849
1400e697
PB
4850static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4851 struct io_async_msghdr *iomsg)
52de1fe1 4852{
1400e697 4853 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
4854
4855#ifdef CONFIG_COMPAT
4856 if (req->ctx->compat)
1400e697 4857 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 4858#endif
52de1fe1 4859
1400e697 4860 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
4861}
4862
bcda7baa 4863static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
7fbb1b54 4864 bool needs_lock)
bcda7baa
JA
4865{
4866 struct io_sr_msg *sr = &req->sr_msg;
bcda7baa 4867
30d51dd4 4868 return io_buffer_select(req, &sr->len, sr->bgid, needs_lock);
fddaface
JA
4869}
4870
7fbb1b54
PB
4871static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4872{
30d51dd4 4873 return io_put_kbuf(req, req->kbuf);
7fbb1b54
PB
4874}
4875
93642ef8 4876static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 4877{
99bc4c38 4878 int ret;
3529d8c2 4879
93642ef8
PB
4880 ret = io_recvmsg_copy_hdr(req, req->async_data);
4881 if (!ret)
4882 req->flags |= REQ_F_NEED_CLEANUP;
4883 return ret;
4884}
4885
4886static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4887{
4888 struct io_sr_msg *sr = &req->sr_msg;
4889
d2b6f48b
PB
4890 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4891 return -EINVAL;
4892
270a5940 4893 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 4894 sr->len = READ_ONCE(sqe->len);
bcda7baa 4895 sr->bgid = READ_ONCE(sqe->buf_group);
04411806
PB
4896 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4897 if (sr->msg_flags & MSG_DONTWAIT)
4898 req->flags |= REQ_F_NOWAIT;
06b76d44 4899
d8768362
JA
4900#ifdef CONFIG_COMPAT
4901 if (req->ctx->compat)
4902 sr->msg_flags |= MSG_CMSG_COMPAT;
4903#endif
93642ef8 4904 return 0;
aa1fa28f
JA
4905}
4906
889fca73 4907static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4908{
6b754c8b 4909 struct io_async_msghdr iomsg, *kmsg;
03b1230c 4910 struct socket *sock;
7fbb1b54 4911 struct io_buffer *kbuf;
7a7cacba 4912 unsigned flags;
0031275d 4913 int min_ret = 0;
52de1fe1 4914 int ret, cflags = 0;
45d189c6 4915 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 4916
dba4a925 4917 sock = sock_from_file(req->file);
7a7cacba 4918 if (unlikely(!sock))
dba4a925 4919 return -ENOTSOCK;
3529d8c2 4920
d886e185
PB
4921 if (req_has_async_data(req)) {
4922 kmsg = req->async_data;
4923 } else {
7a7cacba
PB
4924 ret = io_recvmsg_copy_hdr(req, &iomsg);
4925 if (ret)
681fda8d 4926 return ret;
7a7cacba
PB
4927 kmsg = &iomsg;
4928 }
03b1230c 4929
bc02ef33 4930 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4931 kbuf = io_recv_buffer_select(req, !force_nonblock);
bc02ef33 4932 if (IS_ERR(kbuf))
52de1fe1 4933 return PTR_ERR(kbuf);
7a7cacba 4934 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
4935 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4936 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
4937 1, req->sr_msg.len);
4938 }
52de1fe1 4939
04411806
PB
4940 flags = req->sr_msg.msg_flags;
4941 if (force_nonblock)
7a7cacba 4942 flags |= MSG_DONTWAIT;
0031275d
SM
4943 if (flags & MSG_WAITALL)
4944 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4945
7a7cacba
PB
4946 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4947 kmsg->uaddr, flags);
0e1b6fe3
PB
4948 if (force_nonblock && ret == -EAGAIN)
4949 return io_setup_async_msg(req, kmsg);
7a7cacba
PB
4950 if (ret == -ERESTARTSYS)
4951 ret = -EINTR;
03b1230c 4952
7fbb1b54
PB
4953 if (req->flags & REQ_F_BUFFER_SELECTED)
4954 cflags = io_put_recv_kbuf(req);
257e84a5
PB
4955 /* fast path, check for non-NULL to avoid function call */
4956 if (kmsg->free_iov)
4957 kfree(kmsg->free_iov);
99bc4c38 4958 req->flags &= ~REQ_F_NEED_CLEANUP;
0031275d 4959 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
93d2bcd2 4960 req_set_fail(req);
889fca73 4961 __io_req_complete(req, issue_flags, ret, cflags);
03b1230c 4962 return 0;
0fa03c62 4963}
5d17b4a4 4964
889fca73 4965static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4966{
6b754c8b 4967 struct io_buffer *kbuf;
7a7cacba
PB
4968 struct io_sr_msg *sr = &req->sr_msg;
4969 struct msghdr msg;
4970 void __user *buf = sr->buf;
fddaface 4971 struct socket *sock;
7a7cacba
PB
4972 struct iovec iov;
4973 unsigned flags;
0031275d 4974 int min_ret = 0;
bcda7baa 4975 int ret, cflags = 0;
45d189c6 4976 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 4977
dba4a925 4978 sock = sock_from_file(req->file);
7a7cacba 4979 if (unlikely(!sock))
dba4a925 4980 return -ENOTSOCK;
fddaface 4981
bc02ef33 4982 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4983 kbuf = io_recv_buffer_select(req, !force_nonblock);
bcda7baa
JA
4984 if (IS_ERR(kbuf))
4985 return PTR_ERR(kbuf);
7a7cacba 4986 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 4987 }
bcda7baa 4988
7a7cacba 4989 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
4990 if (unlikely(ret))
4991 goto out_free;
fddaface 4992
7a7cacba
PB
4993 msg.msg_name = NULL;
4994 msg.msg_control = NULL;
4995 msg.msg_controllen = 0;
4996 msg.msg_namelen = 0;
4997 msg.msg_iocb = NULL;
4998 msg.msg_flags = 0;
fddaface 4999
04411806
PB
5000 flags = req->sr_msg.msg_flags;
5001 if (force_nonblock)
7a7cacba 5002 flags |= MSG_DONTWAIT;
0031275d
SM
5003 if (flags & MSG_WAITALL)
5004 min_ret = iov_iter_count(&msg.msg_iter);
5005
7a7cacba
PB
5006 ret = sock_recvmsg(sock, &msg, flags);
5007 if (force_nonblock && ret == -EAGAIN)
5008 return -EAGAIN;
5009 if (ret == -ERESTARTSYS)
5010 ret = -EINTR;
14c32eee 5011out_free:
7fbb1b54
PB
5012 if (req->flags & REQ_F_BUFFER_SELECTED)
5013 cflags = io_put_recv_kbuf(req);
0031275d 5014 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
93d2bcd2 5015 req_set_fail(req);
889fca73 5016 __io_req_complete(req, issue_flags, ret, cflags);
fddaface 5017 return 0;
fddaface
JA
5018}
5019
3529d8c2 5020static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 5021{
8ed8d3c3
JA
5022 struct io_accept *accept = &req->accept;
5023
14587a46 5024 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 5025 return -EINVAL;
aaa4db12 5026 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
5027 return -EINVAL;
5028
d55e5f5b
JA
5029 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5030 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 5031 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 5032 accept->nofile = rlimit(RLIMIT_NOFILE);
a7083ad5 5033
aaa4db12
PB
5034 accept->file_slot = READ_ONCE(sqe->file_index);
5035 if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
5036 (accept->flags & SOCK_CLOEXEC)))
5037 return -EINVAL;
a7083ad5
PB
5038 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5039 return -EINVAL;
5040 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5041 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
8ed8d3c3 5042 return 0;
8ed8d3c3 5043}
17f2fe35 5044
889fca73 5045static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
5046{
5047 struct io_accept *accept = &req->accept;
45d189c6 5048 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 5049 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
aaa4db12 5050 bool fixed = !!accept->file_slot;
a7083ad5
PB
5051 struct file *file;
5052 int ret, fd;
8ed8d3c3 5053
e697deed
JX
5054 if (req->file->f_flags & O_NONBLOCK)
5055 req->flags |= REQ_F_NOWAIT;
5056
aaa4db12
PB
5057 if (!fixed) {
5058 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5059 if (unlikely(fd < 0))
5060 return fd;
5061 }
a7083ad5
PB
5062 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5063 accept->flags);
5064 if (IS_ERR(file)) {
aaa4db12
PB
5065 if (!fixed)
5066 put_unused_fd(fd);
a7083ad5
PB
5067 ret = PTR_ERR(file);
5068 if (ret == -EAGAIN && force_nonblock)
5069 return -EAGAIN;
ac45abc0
PB
5070 if (ret == -ERESTARTSYS)
5071 ret = -EINTR;
93d2bcd2 5072 req_set_fail(req);
aaa4db12 5073 } else if (!fixed) {
a7083ad5
PB
5074 fd_install(fd, file);
5075 ret = fd;
aaa4db12
PB
5076 } else {
5077 ret = io_install_fixed_file(req, file, issue_flags,
5078 accept->file_slot - 1);
ac45abc0 5079 }
889fca73 5080 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 5081 return 0;
8ed8d3c3
JA
5082}
5083
93642ef8
PB
5084static int io_connect_prep_async(struct io_kiocb *req)
5085{
5086 struct io_async_connect *io = req->async_data;
5087 struct io_connect *conn = &req->connect;
5088
5089 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5090}
5091
3529d8c2 5092static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 5093{
3529d8c2 5094 struct io_connect *conn = &req->connect;
f499a021 5095
14587a46 5096 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1 5097 return -EINVAL;
26578cda
PB
5098 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5099 sqe->splice_fd_in)
3fbb51c1
JA
5100 return -EINVAL;
5101
3529d8c2
JA
5102 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5103 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 5104 return 0;
f499a021
JA
5105}
5106
889fca73 5107static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 5108{
e8c2bc1f 5109 struct io_async_connect __io, *io;
f8e85cf2 5110 unsigned file_flags;
3fbb51c1 5111 int ret;
45d189c6 5112 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 5113
d886e185 5114 if (req_has_async_data(req)) {
e8c2bc1f 5115 io = req->async_data;
f499a021 5116 } else {
3529d8c2
JA
5117 ret = move_addr_to_kernel(req->connect.addr,
5118 req->connect.addr_len,
e8c2bc1f 5119 &__io.address);
f499a021
JA
5120 if (ret)
5121 goto out;
5122 io = &__io;
5123 }
5124
3fbb51c1
JA
5125 file_flags = force_nonblock ? O_NONBLOCK : 0;
5126
e8c2bc1f 5127 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 5128 req->connect.addr_len, file_flags);
87f80d62 5129 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
d886e185 5130 if (req_has_async_data(req))
b7bb4f7d 5131 return -EAGAIN;
e8c2bc1f 5132 if (io_alloc_async_data(req)) {
f499a021
JA
5133 ret = -ENOMEM;
5134 goto out;
5135 }
e8c2bc1f 5136 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 5137 return -EAGAIN;
f499a021 5138 }
f8e85cf2
JA
5139 if (ret == -ERESTARTSYS)
5140 ret = -EINTR;
f499a021 5141out:
4e88d6e7 5142 if (ret < 0)
93d2bcd2 5143 req_set_fail(req);
889fca73 5144 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 5145 return 0;
469956e8
Y
5146}
5147#else /* !CONFIG_NET */
99a10081
JA
5148#define IO_NETOP_FN(op) \
5149static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
5150{ \
5151 return -EOPNOTSUPP; \
5152}
5153
5154#define IO_NETOP_PREP(op) \
5155IO_NETOP_FN(op) \
5156static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5157{ \
5158 return -EOPNOTSUPP; \
5159} \
5160
5161#define IO_NETOP_PREP_ASYNC(op) \
5162IO_NETOP_PREP(op) \
5163static int io_##op##_prep_async(struct io_kiocb *req) \
5164{ \
5165 return -EOPNOTSUPP; \
5166}
5167
5168IO_NETOP_PREP_ASYNC(sendmsg);
5169IO_NETOP_PREP_ASYNC(recvmsg);
5170IO_NETOP_PREP_ASYNC(connect);
5171IO_NETOP_PREP(accept);
5172IO_NETOP_FN(send);
5173IO_NETOP_FN(recv);
469956e8 5174#endif /* CONFIG_NET */
f8e85cf2 5175
d7718a9d
JA
5176struct io_poll_table {
5177 struct poll_table_struct pt;
5178 struct io_kiocb *req;
68b11e8b 5179 int nr_entries;
d7718a9d
JA
5180 int error;
5181};
ce593a6c 5182
d7718a9d 5183static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
5b0a6acc 5184 __poll_t mask, io_req_tw_func_t func)
d7718a9d 5185{
d7718a9d
JA
5186 /* for instances that support it check for an event match first: */
5187 if (mask && !(mask & poll->events))
5188 return 0;
5189
5190 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
5191
5192 list_del_init(&poll->wait.entry);
5193
d7718a9d 5194 req->result = mask;
5b0a6acc 5195 req->io_task_work.func = func;
6d816e08 5196
d7718a9d 5197 /*
e3aabf95
JA
5198 * If this fails, then the task is exiting. When a task exits, the
5199 * work gets canceled, so just cancel this request as well instead
5200 * of executing it. We can't safely execute it anyway, as we may not
5201 * have the needed state needed for it anyway.
d7718a9d 5202 */
e09ee510 5203 io_req_task_work_add(req);
d7718a9d
JA
5204 return 1;
5205}
5206
74ce6ce4
JA
5207static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
5208 __acquires(&req->ctx->completion_lock)
5209{
5210 struct io_ring_ctx *ctx = req->ctx;
5211
316319e8 5212 /* req->task == current here, checking PF_EXITING is safe */
e09ee510
PB
5213 if (unlikely(req->task->flags & PF_EXITING))
5214 WRITE_ONCE(poll->canceled, true);
5215
74ce6ce4
JA
5216 if (!req->result && !READ_ONCE(poll->canceled)) {
5217 struct poll_table_struct pt = { ._key = poll->events };
5218
5219 req->result = vfs_poll(req->file, &pt) & poll->events;
5220 }
5221
79ebeaee 5222 spin_lock(&ctx->completion_lock);
74ce6ce4
JA
5223 if (!req->result && !READ_ONCE(poll->canceled)) {
5224 add_wait_queue(poll->head, &poll->wait);
5225 return true;
5226 }
5227
5228 return false;
5229}
5230
d4e7cd36 5231static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 5232{
e8c2bc1f 5233 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 5234 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 5235 return req->async_data;
d4e7cd36
JA
5236 return req->apoll->double_poll;
5237}
5238
5239static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5240{
5241 if (req->opcode == IORING_OP_POLL_ADD)
5242 return &req->poll;
5243 return &req->apoll->poll;
5244}
5245
5246static void io_poll_remove_double(struct io_kiocb *req)
e07785b0 5247 __must_hold(&req->ctx->completion_lock)
d4e7cd36
JA
5248{
5249 struct io_poll_iocb *poll = io_poll_get_double(req);
18bceab1
JA
5250
5251 lockdep_assert_held(&req->ctx->completion_lock);
5252
5253 if (poll && poll->head) {
5254 struct wait_queue_head *head = poll->head;
5255
79ebeaee 5256 spin_lock_irq(&head->lock);
18bceab1
JA
5257 list_del_init(&poll->wait.entry);
5258 if (poll->wait.private)
de9b4cca 5259 req_ref_put(req);
18bceab1 5260 poll->head = NULL;
79ebeaee 5261 spin_unlock_irq(&head->lock);
18bceab1
JA
5262 }
5263}
5264
31efe48e 5265static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
e07785b0 5266 __must_hold(&req->ctx->completion_lock)
18bceab1
JA
5267{
5268 struct io_ring_ctx *ctx = req->ctx;
88e41cf9 5269 unsigned flags = IORING_CQE_F_MORE;
e27414be 5270 int error;
18bceab1 5271
e27414be 5272 if (READ_ONCE(req->poll.canceled)) {
45ab03b1 5273 error = -ECANCELED;
88e41cf9 5274 req->poll.events |= EPOLLONESHOT;
e27414be 5275 } else {
5082620f 5276 error = mangle_poll(mask);
e27414be 5277 }
b69de288
JA
5278 if (req->poll.events & EPOLLONESHOT)
5279 flags = 0;
a62682f9
HX
5280 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
5281 req->poll.events |= EPOLLONESHOT;
88e41cf9 5282 flags = 0;
a62682f9 5283 }
7b289c38
HX
5284 if (flags & IORING_CQE_F_MORE)
5285 ctx->cq_extra++;
18bceab1 5286
88e41cf9 5287 return !(flags & IORING_CQE_F_MORE);
18bceab1
JA
5288}
5289
f237c30a 5290static void io_poll_task_func(struct io_kiocb *req, bool *locked)
18bceab1
JA
5291{
5292 struct io_ring_ctx *ctx = req->ctx;
dd221f46 5293 struct io_kiocb *nxt;
18bceab1
JA
5294
5295 if (io_poll_rewait(req, &req->poll)) {
79ebeaee 5296 spin_unlock(&ctx->completion_lock);
dd221f46 5297 } else {
f40b964a 5298 bool done;
18bceab1 5299
5b7aa38d
HX
5300 if (req->poll.done) {
5301 spin_unlock(&ctx->completion_lock);
5302 return;
5303 }
31efe48e 5304 done = __io_poll_complete(req, req->result);
88e41cf9 5305 if (done) {
a890d01e 5306 io_poll_remove_double(req);
88e41cf9 5307 hash_del(&req->hash_node);
bd99c71b 5308 req->poll.done = true;
f40b964a 5309 } else {
88e41cf9
JA
5310 req->result = 0;
5311 add_wait_queue(req->poll.head, &req->poll.wait);
5312 }
31efe48e 5313 io_commit_cqring(ctx);
79ebeaee 5314 spin_unlock(&ctx->completion_lock);
dd221f46 5315 io_cqring_ev_posted(ctx);
18bceab1 5316
88e41cf9
JA
5317 if (done) {
5318 nxt = io_put_req_find_next(req);
5319 if (nxt)
f237c30a 5320 io_req_task_submit(nxt, locked);
88e41cf9 5321 }
dd221f46 5322 }
18bceab1
JA
5323}
5324
5325static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
5326 int sync, void *key)
5327{
5328 struct io_kiocb *req = wait->private;
d4e7cd36 5329 struct io_poll_iocb *poll = io_poll_get_single(req);
18bceab1 5330 __poll_t mask = key_to_poll(key);
79ebeaee 5331 unsigned long flags;
18bceab1
JA
5332
5333 /* for instances that support it check for an event match first: */
5334 if (mask && !(mask & poll->events))
5335 return 0;
88e41cf9
JA
5336 if (!(poll->events & EPOLLONESHOT))
5337 return poll->wait.func(&poll->wait, mode, sync, key);
18bceab1 5338
8706e04e
JA
5339 list_del_init(&wait->entry);
5340
9ce85ef2 5341 if (poll->head) {
18bceab1
JA
5342 bool done;
5343
79ebeaee 5344 spin_lock_irqsave(&poll->head->lock, flags);
807abcb0 5345 done = list_empty(&poll->wait.entry);
18bceab1 5346 if (!done)
807abcb0 5347 list_del_init(&poll->wait.entry);
d4e7cd36
JA
5348 /* make sure double remove sees this as being gone */
5349 wait->private = NULL;
79ebeaee 5350 spin_unlock_irqrestore(&poll->head->lock, flags);
c8b5e260
JA
5351 if (!done) {
5352 /* use wait func handler, so it matches the rq type */
5353 poll->wait.func(&poll->wait, mode, sync, key);
5354 }
18bceab1 5355 }
de9b4cca 5356 req_ref_put(req);
18bceab1
JA
5357 return 1;
5358}
5359
5360static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5361 wait_queue_func_t wake_func)
5362{
5363 poll->head = NULL;
5364 poll->done = false;
5365 poll->canceled = false;
464dca61
JA
5366#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5367 /* mask in events that we always want/need */
5368 poll->events = events | IO_POLL_UNMASK;
18bceab1
JA
5369 INIT_LIST_HEAD(&poll->wait.entry);
5370 init_waitqueue_func_entry(&poll->wait, wake_func);
5371}
5372
5373static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
5374 struct wait_queue_head *head,
5375 struct io_poll_iocb **poll_ptr)
18bceab1
JA
5376{
5377 struct io_kiocb *req = pt->req;
5378
5379 /*
68b11e8b
PB
5380 * The file being polled uses multiple waitqueues for poll handling
5381 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
5382 * if this happens.
18bceab1 5383 */
68b11e8b 5384 if (unlikely(pt->nr_entries)) {
58852d4d
PB
5385 struct io_poll_iocb *poll_one = poll;
5386
23a65db8
PB
5387 /* double add on the same waitqueue head, ignore */
5388 if (poll_one->head == head)
5389 return;
18bceab1 5390 /* already have a 2nd entry, fail a third attempt */
807abcb0 5391 if (*poll_ptr) {
23a65db8
PB
5392 if ((*poll_ptr)->head == head)
5393 return;
18bceab1
JA
5394 pt->error = -EINVAL;
5395 return;
5396 }
ea6a693d
JA
5397 /*
5398 * Can't handle multishot for double wait for now, turn it
5399 * into one-shot mode.
5400 */
7a274727
PB
5401 if (!(poll_one->events & EPOLLONESHOT))
5402 poll_one->events |= EPOLLONESHOT;
18bceab1
JA
5403 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5404 if (!poll) {
5405 pt->error = -ENOMEM;
5406 return;
5407 }
58852d4d 5408 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
de9b4cca 5409 req_ref_get(req);
18bceab1 5410 poll->wait.private = req;
d886e185 5411
807abcb0 5412 *poll_ptr = poll;
d886e185
PB
5413 if (req->opcode == IORING_OP_POLL_ADD)
5414 req->flags |= REQ_F_ASYNC_DATA;
18bceab1
JA
5415 }
5416
68b11e8b 5417 pt->nr_entries++;
18bceab1 5418 poll->head = head;
a31eb4a2
JX
5419
5420 if (poll->events & EPOLLEXCLUSIVE)
5421 add_wait_queue_exclusive(head, &poll->wait);
5422 else
5423 add_wait_queue(head, &poll->wait);
18bceab1
JA
5424}
5425
5426static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5427 struct poll_table_struct *p)
5428{
5429 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
807abcb0 5430 struct async_poll *apoll = pt->req->apoll;
18bceab1 5431
807abcb0 5432 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
18bceab1
JA
5433}
5434
f237c30a 5435static void io_async_task_func(struct io_kiocb *req, bool *locked)
d7718a9d 5436{
d7718a9d
JA
5437 struct async_poll *apoll = req->apoll;
5438 struct io_ring_ctx *ctx = req->ctx;
5439
236daeae 5440 trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
d7718a9d 5441
74ce6ce4 5442 if (io_poll_rewait(req, &apoll->poll)) {
79ebeaee 5443 spin_unlock(&ctx->completion_lock);
74ce6ce4 5444 return;
d7718a9d
JA
5445 }
5446
0ea13b44 5447 hash_del(&req->hash_node);
d4e7cd36 5448 io_poll_remove_double(req);
bd99c71b 5449 apoll->poll.done = true;
79ebeaee 5450 spin_unlock(&ctx->completion_lock);
74ce6ce4 5451
0be0b0e3 5452 if (!READ_ONCE(apoll->poll.canceled))
f237c30a 5453 io_req_task_submit(req, locked);
0be0b0e3 5454 else
2593553a 5455 io_req_complete_failed(req, -ECANCELED);
d7718a9d
JA
5456}
5457
5458static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5459 void *key)
5460{
5461 struct io_kiocb *req = wait->private;
5462 struct io_poll_iocb *poll = &req->apoll->poll;
5463
5464 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5465 key_to_poll(key));
5466
5467 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5468}
5469
5470static void io_poll_req_insert(struct io_kiocb *req)
5471{
5472 struct io_ring_ctx *ctx = req->ctx;
5473 struct hlist_head *list;
5474
5475 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5476 hlist_add_head(&req->hash_node, list);
5477}
5478
5479static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5480 struct io_poll_iocb *poll,
5481 struct io_poll_table *ipt, __poll_t mask,
5482 wait_queue_func_t wake_func)
5483 __acquires(&ctx->completion_lock)
5484{
5485 struct io_ring_ctx *ctx = req->ctx;
5486 bool cancel = false;
5487
4d52f338 5488 INIT_HLIST_NODE(&req->hash_node);
18bceab1 5489 io_init_poll_iocb(poll, mask, wake_func);
b90cd197 5490 poll->file = req->file;
18bceab1 5491 poll->wait.private = req;
d7718a9d
JA
5492
5493 ipt->pt._key = mask;
5494 ipt->req = req;
68b11e8b
PB
5495 ipt->error = 0;
5496 ipt->nr_entries = 0;
d7718a9d 5497
d7718a9d 5498 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
68b11e8b
PB
5499 if (unlikely(!ipt->nr_entries) && !ipt->error)
5500 ipt->error = -EINVAL;
d7718a9d 5501
79ebeaee 5502 spin_lock(&ctx->completion_lock);
a890d01e 5503 if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
46fee9ab 5504 io_poll_remove_double(req);
d7718a9d 5505 if (likely(poll->head)) {
79ebeaee 5506 spin_lock_irq(&poll->head->lock);
d7718a9d
JA
5507 if (unlikely(list_empty(&poll->wait.entry))) {
5508 if (ipt->error)
5509 cancel = true;
5510 ipt->error = 0;
5511 mask = 0;
5512 }
88e41cf9 5513 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
d7718a9d
JA
5514 list_del_init(&poll->wait.entry);
5515 else if (cancel)
5516 WRITE_ONCE(poll->canceled, true);
5517 else if (!poll->done) /* actually waiting for an event */
5518 io_poll_req_insert(req);
79ebeaee 5519 spin_unlock_irq(&poll->head->lock);
d7718a9d
JA
5520 }
5521
5522 return mask;
5523}
5524
59b735ae
OL
5525enum {
5526 IO_APOLL_OK,
5527 IO_APOLL_ABORTED,
5528 IO_APOLL_READY
5529};
5530
5531static int io_arm_poll_handler(struct io_kiocb *req)
d7718a9d
JA
5532{
5533 const struct io_op_def *def = &io_op_defs[req->opcode];
5534 struct io_ring_ctx *ctx = req->ctx;
5535 struct async_poll *apoll;
5536 struct io_poll_table ipt;
b2d9c3da 5537 __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
9dab14b8 5538 int rw;
d7718a9d
JA
5539
5540 if (!req->file || !file_can_poll(req->file))
59b735ae 5541 return IO_APOLL_ABORTED;
24c74678 5542 if (req->flags & REQ_F_POLLED)
59b735ae 5543 return IO_APOLL_ABORTED;
b2d9c3da
PB
5544 if (!def->pollin && !def->pollout)
5545 return IO_APOLL_ABORTED;
5546
5547 if (def->pollin) {
9dab14b8 5548 rw = READ;
b2d9c3da
PB
5549 mask |= POLLIN | POLLRDNORM;
5550
5551 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5552 if ((req->opcode == IORING_OP_RECVMSG) &&
5553 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5554 mask &= ~POLLIN;
5555 } else {
9dab14b8 5556 rw = WRITE;
b2d9c3da
PB
5557 mask |= POLLOUT | POLLWRNORM;
5558 }
5559
9dab14b8 5560 /* if we can't nonblock try, then no point in arming a poll handler */
b191e2df 5561 if (!io_file_supports_nowait(req, rw))
59b735ae 5562 return IO_APOLL_ABORTED;
d7718a9d
JA
5563
5564 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5565 if (unlikely(!apoll))
59b735ae 5566 return IO_APOLL_ABORTED;
807abcb0 5567 apoll->double_poll = NULL;
d7718a9d 5568 req->apoll = apoll;
b2d9c3da 5569 req->flags |= REQ_F_POLLED;
d7718a9d 5570 ipt.pt._qproc = io_async_queue_proc;
48dcd38d 5571 io_req_set_refcount(req);
d7718a9d
JA
5572
5573 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5574 io_async_wake);
79ebeaee 5575 spin_unlock(&ctx->completion_lock);
41a5169c
HX
5576 if (ret || ipt.error)
5577 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5578
236daeae
OL
5579 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5580 mask, apoll->poll.events);
59b735ae 5581 return IO_APOLL_OK;
d7718a9d
JA
5582}
5583
5584static bool __io_poll_remove_one(struct io_kiocb *req,
b2e720ac 5585 struct io_poll_iocb *poll, bool do_cancel)
e07785b0 5586 __must_hold(&req->ctx->completion_lock)
221c5eb2 5587{
b41e9852 5588 bool do_complete = false;
221c5eb2 5589
5082620f
JA
5590 if (!poll->head)
5591 return false;
79ebeaee 5592 spin_lock_irq(&poll->head->lock);
b2e720ac
JA
5593 if (do_cancel)
5594 WRITE_ONCE(poll->canceled, true);
392edb45
JA
5595 if (!list_empty(&poll->wait.entry)) {
5596 list_del_init(&poll->wait.entry);
b41e9852 5597 do_complete = true;
221c5eb2 5598 }
79ebeaee 5599 spin_unlock_irq(&poll->head->lock);
3bfa5bcb 5600 hash_del(&req->hash_node);
d7718a9d
JA
5601 return do_complete;
5602}
5603
5d709043 5604static bool io_poll_remove_one(struct io_kiocb *req)
e07785b0 5605 __must_hold(&req->ctx->completion_lock)
d7718a9d
JA
5606{
5607 bool do_complete;
5608
d4e7cd36 5609 io_poll_remove_double(req);
e31001a3 5610 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
d4e7cd36 5611
b41e9852 5612 if (do_complete) {
d4d19c19 5613 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
b41e9852 5614 io_commit_cqring(req->ctx);
93d2bcd2 5615 req_set_fail(req);
91c2f697 5616 io_put_req_deferred(req);
5d709043 5617 }
b41e9852 5618 return do_complete;
221c5eb2
JA
5619}
5620
76e1b642
JA
5621/*
5622 * Returns true if we found and killed one or more poll requests
5623 */
c072481d
PB
5624static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
5625 struct task_struct *tsk, bool cancel_all)
221c5eb2 5626{
78076bb6 5627 struct hlist_node *tmp;
221c5eb2 5628 struct io_kiocb *req;
8e2e1faf 5629 int posted = 0, i;
221c5eb2 5630
79ebeaee 5631 spin_lock(&ctx->completion_lock);
78076bb6
JA
5632 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5633 struct hlist_head *list;
5634
5635 list = &ctx->cancel_hash[i];
f3606e3a 5636 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
3dd0c97a 5637 if (io_match_task(req, tsk, cancel_all))
f3606e3a
JA
5638 posted += io_poll_remove_one(req);
5639 }
221c5eb2 5640 }
79ebeaee 5641 spin_unlock(&ctx->completion_lock);
b41e9852 5642
8e2e1faf
JA
5643 if (posted)
5644 io_cqring_ev_posted(ctx);
76e1b642
JA
5645
5646 return posted != 0;
221c5eb2
JA
5647}
5648
9ba5fac8
PB
5649static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5650 bool poll_only)
e07785b0 5651 __must_hold(&ctx->completion_lock)
47f46768 5652{
78076bb6 5653 struct hlist_head *list;
47f46768
JA
5654 struct io_kiocb *req;
5655
78076bb6
JA
5656 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5657 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
5658 if (sqe_addr != req->user_data)
5659 continue;
9ba5fac8
PB
5660 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5661 continue;
b2cb805f 5662 return req;
47f46768 5663 }
b2cb805f
JA
5664 return NULL;
5665}
5666
9ba5fac8
PB
5667static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5668 bool poll_only)
e07785b0 5669 __must_hold(&ctx->completion_lock)
b2cb805f
JA
5670{
5671 struct io_kiocb *req;
5672
9ba5fac8 5673 req = io_poll_find(ctx, sqe_addr, poll_only);
b2cb805f
JA
5674 if (!req)
5675 return -ENOENT;
5676 if (io_poll_remove_one(req))
5677 return 0;
5678
5679 return -EALREADY;
47f46768
JA
5680}
5681
9096af3e
PB
5682static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5683 unsigned int flags)
5684{
5685 u32 events;
47f46768 5686
9096af3e
PB
5687 events = READ_ONCE(sqe->poll32_events);
5688#ifdef __BIG_ENDIAN
5689 events = swahw32(events);
5690#endif
5691 if (!(flags & IORING_POLL_ADD_MULTI))
5692 events |= EPOLLONESHOT;
5693 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
47f46768
JA
5694}
5695
c5de0036 5696static int io_poll_update_prep(struct io_kiocb *req,
3529d8c2 5697 const struct io_uring_sqe *sqe)
0969e783 5698{
c5de0036
PB
5699 struct io_poll_update *upd = &req->poll_update;
5700 u32 flags;
5701
0969e783
JA
5702 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5703 return -EINVAL;
26578cda 5704 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
c5de0036
PB
5705 return -EINVAL;
5706 flags = READ_ONCE(sqe->len);
5707 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5708 IORING_POLL_ADD_MULTI))
5709 return -EINVAL;
5710 /* meaningless without update */
5711 if (flags == IORING_POLL_ADD_MULTI)
0969e783
JA
5712 return -EINVAL;
5713
c5de0036
PB
5714 upd->old_user_data = READ_ONCE(sqe->addr);
5715 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5716 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
221c5eb2 5717
c5de0036
PB
5718 upd->new_user_data = READ_ONCE(sqe->off);
5719 if (!upd->update_user_data && upd->new_user_data)
5720 return -EINVAL;
5721 if (upd->update_events)
5722 upd->events = io_poll_parse_events(sqe, flags);
5723 else if (sqe->poll32_events)
5724 return -EINVAL;
221c5eb2 5725
221c5eb2
JA
5726 return 0;
5727}
5728
221c5eb2
JA
5729static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5730 void *key)
5731{
c2f2eb7d
JA
5732 struct io_kiocb *req = wait->private;
5733 struct io_poll_iocb *poll = &req->poll;
221c5eb2 5734
d7718a9d 5735 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
5736}
5737
221c5eb2
JA
5738static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5739 struct poll_table_struct *p)
5740{
5741 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5742
e8c2bc1f 5743 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
eac406c6
JA
5744}
5745
3529d8c2 5746static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
5747{
5748 struct io_poll_iocb *poll = &req->poll;
c5de0036 5749 u32 flags;
221c5eb2
JA
5750
5751 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5752 return -EINVAL;
c5de0036 5753 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
88e41cf9
JA
5754 return -EINVAL;
5755 flags = READ_ONCE(sqe->len);
c5de0036 5756 if (flags & ~IORING_POLL_ADD_MULTI)
221c5eb2
JA
5757 return -EINVAL;
5758
48dcd38d 5759 io_req_set_refcount(req);
c5de0036 5760 poll->events = io_poll_parse_events(sqe, flags);
0969e783
JA
5761 return 0;
5762}
5763
61e98203 5764static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
5765{
5766 struct io_poll_iocb *poll = &req->poll;
5767 struct io_ring_ctx *ctx = req->ctx;
5768 struct io_poll_table ipt;
0969e783 5769 __poll_t mask;
5b7aa38d 5770 bool done;
0969e783 5771
d7718a9d 5772 ipt.pt._qproc = io_poll_queue_proc;
36703247 5773
d7718a9d
JA
5774 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5775 io_poll_wake);
221c5eb2 5776
8c838788 5777 if (mask) { /* no async, we'd stolen it */
221c5eb2 5778 ipt.error = 0;
eb6e6f06
PB
5779 done = __io_poll_complete(req, mask);
5780 io_commit_cqring(req->ctx);
221c5eb2 5781 }
79ebeaee 5782 spin_unlock(&ctx->completion_lock);
221c5eb2 5783
8c838788
JA
5784 if (mask) {
5785 io_cqring_ev_posted(ctx);
5b7aa38d 5786 if (done)
88e41cf9 5787 io_put_req(req);
221c5eb2 5788 }
8c838788 5789 return ipt.error;
221c5eb2
JA
5790}
5791
c5de0036 5792static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
b69de288
JA
5793{
5794 struct io_ring_ctx *ctx = req->ctx;
5795 struct io_kiocb *preq;
cb3b200e 5796 bool completing;
b69de288
JA
5797 int ret;
5798
79ebeaee 5799 spin_lock(&ctx->completion_lock);
9ba5fac8 5800 preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
b69de288
JA
5801 if (!preq) {
5802 ret = -ENOENT;
5803 goto err;
b69de288 5804 }
cb3b200e 5805
c5de0036
PB
5806 if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5807 completing = true;
5808 ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5809 goto err;
5810 }
5811
cb3b200e
JA
5812 /*
5813 * Don't allow racy completion with singleshot, as we cannot safely
5814 * update those. For multishot, if we're racing with completion, just
5815 * let completion re-add it.
5816 */
5817 completing = !__io_poll_remove_one(preq, &preq->poll, false);
5818 if (completing && (preq->poll.events & EPOLLONESHOT)) {
5819 ret = -EALREADY;
5820 goto err;
b69de288
JA
5821 }
5822 /* we now have a detached poll request. reissue. */
5823 ret = 0;
5824err:
b69de288 5825 if (ret < 0) {
79ebeaee 5826 spin_unlock(&ctx->completion_lock);
93d2bcd2 5827 req_set_fail(req);
b69de288
JA
5828 io_req_complete(req, ret);
5829 return 0;
5830 }
5831 /* only mask one event flags, keep behavior flags */
9d805892 5832 if (req->poll_update.update_events) {
b69de288 5833 preq->poll.events &= ~0xffff;
9d805892 5834 preq->poll.events |= req->poll_update.events & 0xffff;
b69de288
JA
5835 preq->poll.events |= IO_POLL_UNMASK;
5836 }
9d805892
PB
5837 if (req->poll_update.update_user_data)
5838 preq->user_data = req->poll_update.new_user_data;
79ebeaee 5839 spin_unlock(&ctx->completion_lock);
cb3b200e 5840
b69de288
JA
5841 /* complete update request, we're done with it */
5842 io_req_complete(req, ret);
5843
cb3b200e 5844 if (!completing) {
c5de0036 5845 ret = io_poll_add(preq, issue_flags);
cb3b200e 5846 if (ret < 0) {
93d2bcd2 5847 req_set_fail(preq);
cb3b200e
JA
5848 io_req_complete(preq, ret);
5849 }
b69de288
JA
5850 }
5851 return 0;
5852}
5853
f237c30a 5854static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
89850fce 5855{
6224590d
PB
5856 struct io_timeout_data *data = req->async_data;
5857
5858 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
5859 req_set_fail(req);
505657bc 5860 io_req_complete_post(req, -ETIME, 0);
89850fce
JA
5861}
5862
5262f567
JA
5863static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5864{
ad8a48ac
JA
5865 struct io_timeout_data *data = container_of(timer,
5866 struct io_timeout_data, timer);
5867 struct io_kiocb *req = data->req;
5868 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
5869 unsigned long flags;
5870
89850fce 5871 spin_lock_irqsave(&ctx->timeout_lock, flags);
a71976f3 5872 list_del_init(&req->timeout.list);
01cec8c1
PB
5873 atomic_set(&req->ctx->cq_timeouts,
5874 atomic_read(&req->ctx->cq_timeouts) + 1);
89850fce 5875 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
01cec8c1 5876
89850fce
JA
5877 req->io_task_work.func = io_req_task_timeout;
5878 io_req_task_work_add(req);
5262f567
JA
5879 return HRTIMER_NORESTART;
5880}
5881
fbd15848
PB
5882static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5883 __u64 user_data)
89850fce 5884 __must_hold(&ctx->timeout_lock)
f254ac04 5885{
fbd15848 5886 struct io_timeout_data *io;
47f46768 5887 struct io_kiocb *req;
fd9c7bc5 5888 bool found = false;
f254ac04 5889
135fcde8 5890 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
fd9c7bc5
PB
5891 found = user_data == req->user_data;
5892 if (found)
47f46768 5893 break;
47f46768 5894 }
fd9c7bc5
PB
5895 if (!found)
5896 return ERR_PTR(-ENOENT);
fbd15848
PB
5897
5898 io = req->async_data;
fd9c7bc5 5899 if (hrtimer_try_to_cancel(&io->timer) == -1)
fbd15848 5900 return ERR_PTR(-EALREADY);
a71976f3 5901 list_del_init(&req->timeout.list);
fbd15848
PB
5902 return req;
5903}
47f46768 5904
fbd15848 5905static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
ec3c3d0f 5906 __must_hold(&ctx->completion_lock)
89850fce 5907 __must_hold(&ctx->timeout_lock)
fbd15848
PB
5908{
5909 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5910
5911 if (IS_ERR(req))
5912 return PTR_ERR(req);
f254ac04 5913
93d2bcd2 5914 req_set_fail(req);
d4d19c19 5915 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
91c2f697 5916 io_put_req_deferred(req);
f254ac04
JA
5917 return 0;
5918}
5919
50c1df2b
JA
5920static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
5921{
5922 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
5923 case IORING_TIMEOUT_BOOTTIME:
5924 return CLOCK_BOOTTIME;
5925 case IORING_TIMEOUT_REALTIME:
5926 return CLOCK_REALTIME;
5927 default:
5928 /* can't happen, vetted at prep time */
5929 WARN_ON_ONCE(1);
5930 fallthrough;
5931 case 0:
5932 return CLOCK_MONOTONIC;
5933 }
5934}
5935
f1042b6c
PB
5936static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5937 struct timespec64 *ts, enum hrtimer_mode mode)
5938 __must_hold(&ctx->timeout_lock)
5939{
5940 struct io_timeout_data *io;
5941 struct io_kiocb *req;
5942 bool found = false;
5943
5944 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
5945 found = user_data == req->user_data;
5946 if (found)
5947 break;
5948 }
5949 if (!found)
5950 return -ENOENT;
5951
5952 io = req->async_data;
5953 if (hrtimer_try_to_cancel(&io->timer) == -1)
5954 return -EALREADY;
5955 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
5956 io->timer.function = io_link_timeout_fn;
5957 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
5958 return 0;
5959}
5960
9c8e11b3
PB
5961static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5962 struct timespec64 *ts, enum hrtimer_mode mode)
89850fce 5963 __must_hold(&ctx->timeout_lock)
47f46768 5964{
9c8e11b3
PB
5965 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5966 struct io_timeout_data *data;
47f46768 5967
9c8e11b3
PB
5968 if (IS_ERR(req))
5969 return PTR_ERR(req);
47f46768 5970
9c8e11b3
PB
5971 req->timeout.off = 0; /* noseq */
5972 data = req->async_data;
5973 list_add_tail(&req->timeout.list, &ctx->timeout_list);
50c1df2b 5974 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
9c8e11b3
PB
5975 data->timer.function = io_timeout_fn;
5976 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5977 return 0;
47f46768
JA
5978}
5979
3529d8c2
JA
5980static int io_timeout_remove_prep(struct io_kiocb *req,
5981 const struct io_uring_sqe *sqe)
b29472ee 5982{
9c8e11b3
PB
5983 struct io_timeout_rem *tr = &req->timeout_rem;
5984
b29472ee
JA
5985 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5986 return -EINVAL;
61710e43
DA
5987 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5988 return -EINVAL;
26578cda 5989 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
b29472ee
JA
5990 return -EINVAL;
5991
f1042b6c 5992 tr->ltimeout = false;
9c8e11b3
PB
5993 tr->addr = READ_ONCE(sqe->addr);
5994 tr->flags = READ_ONCE(sqe->timeout_flags);
f1042b6c
PB
5995 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
5996 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5997 return -EINVAL;
5998 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
5999 tr->ltimeout = true;
6000 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
9c8e11b3
PB
6001 return -EINVAL;
6002 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6003 return -EFAULT;
6004 } else if (tr->flags) {
6005 /* timeout removal doesn't support flags */
b29472ee 6006 return -EINVAL;
9c8e11b3 6007 }
b29472ee 6008
b29472ee
JA
6009 return 0;
6010}
6011
8662daec
PB
6012static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6013{
6014 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6015 : HRTIMER_MODE_REL;
6016}
6017
11365043
JA
6018/*
6019 * Remove or update an existing timeout command
6020 */
61e98203 6021static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 6022{
9c8e11b3 6023 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 6024 struct io_ring_ctx *ctx = req->ctx;
47f46768 6025 int ret;
11365043 6026
ec3c3d0f
PB
6027 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6028 spin_lock(&ctx->completion_lock);
6029 spin_lock_irq(&ctx->timeout_lock);
9c8e11b3 6030 ret = io_timeout_cancel(ctx, tr->addr);
ec3c3d0f
PB
6031 spin_unlock_irq(&ctx->timeout_lock);
6032 spin_unlock(&ctx->completion_lock);
6033 } else {
f1042b6c
PB
6034 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6035
ec3c3d0f 6036 spin_lock_irq(&ctx->timeout_lock);
f1042b6c
PB
6037 if (tr->ltimeout)
6038 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6039 else
6040 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
ec3c3d0f
PB
6041 spin_unlock_irq(&ctx->timeout_lock);
6042 }
11365043 6043
4e88d6e7 6044 if (ret < 0)
93d2bcd2 6045 req_set_fail(req);
505657bc 6046 io_req_complete_post(req, ret, 0);
11365043 6047 return 0;
5262f567
JA
6048}
6049
3529d8c2 6050static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 6051 bool is_timeout_link)
5262f567 6052{
ad8a48ac 6053 struct io_timeout_data *data;
a41525ab 6054 unsigned flags;
56080b02 6055 u32 off = READ_ONCE(sqe->off);
5262f567 6056
ad8a48ac 6057 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 6058 return -EINVAL;
26578cda
PB
6059 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6060 sqe->splice_fd_in)
a41525ab 6061 return -EINVAL;
56080b02 6062 if (off && is_timeout_link)
2d28390a 6063 return -EINVAL;
a41525ab 6064 flags = READ_ONCE(sqe->timeout_flags);
6224590d
PB
6065 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6066 IORING_TIMEOUT_ETIME_SUCCESS))
50c1df2b
JA
6067 return -EINVAL;
6068 /* more than one clock specified is invalid, obviously */
6069 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5262f567 6070 return -EINVAL;
bdf20073 6071
ef9dd637 6072 INIT_LIST_HEAD(&req->timeout.list);
bfe68a22 6073 req->timeout.off = off;
f18ee4cf
PB
6074 if (unlikely(off && !req->ctx->off_timeout_used))
6075 req->ctx->off_timeout_used = true;
26a61679 6076
d886e185 6077 if (!req_has_async_data(req) && io_alloc_async_data(req))
26a61679
JA
6078 return -ENOMEM;
6079
e8c2bc1f 6080 data = req->async_data;
ad8a48ac 6081 data->req = req;
50c1df2b 6082 data->flags = flags;
ad8a48ac
JA
6083
6084 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
6085 return -EFAULT;
6086
8662daec 6087 data->mode = io_translate_timeout_mode(flags);
50c1df2b 6088 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
b97e736a
PB
6089
6090 if (is_timeout_link) {
6091 struct io_submit_link *link = &req->ctx->submit_state.link;
6092
6093 if (!link->head)
6094 return -EINVAL;
6095 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6096 return -EINVAL;
4d13d1a4
PB
6097 req->timeout.head = link->last;
6098 link->last->flags |= REQ_F_ARM_LTIMEOUT;
b97e736a 6099 }
ad8a48ac
JA
6100 return 0;
6101}
6102
61e98203 6103static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 6104{
ad8a48ac 6105 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 6106 struct io_timeout_data *data = req->async_data;
ad8a48ac 6107 struct list_head *entry;
bfe68a22 6108 u32 tail, off = req->timeout.off;
ad8a48ac 6109
89850fce 6110 spin_lock_irq(&ctx->timeout_lock);
93bd25bb 6111
5262f567
JA
6112 /*
6113 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
6114 * timeout event to be satisfied. If it isn't set, then this is
6115 * a pure timeout request, sequence isn't used.
5262f567 6116 */
8eb7e2d0 6117 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
6118 entry = ctx->timeout_list.prev;
6119 goto add;
6120 }
5262f567 6121
bfe68a22
PB
6122 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6123 req->timeout.target_seq = tail + off;
5262f567 6124
f010505b
MDG
6125 /* Update the last seq here in case io_flush_timeouts() hasn't.
6126 * This is safe because ->completion_lock is held, and submissions
6127 * and completions are never mixed in the same ->completion_lock section.
6128 */
6129 ctx->cq_last_tm_flush = tail;
6130
5262f567
JA
6131 /*
6132 * Insertion sort, ensuring the first entry in the list is always
6133 * the one we need first.
6134 */
5262f567 6135 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
6136 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6137 timeout.list);
5262f567 6138
8eb7e2d0 6139 if (io_is_timeout_noseq(nxt))
93bd25bb 6140 continue;
bfe68a22
PB
6141 /* nxt.seq is behind @tail, otherwise would've been completed */
6142 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
6143 break;
6144 }
93bd25bb 6145add:
135fcde8 6146 list_add(&req->timeout.list, entry);
ad8a48ac
JA
6147 data->timer.function = io_timeout_fn;
6148 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
89850fce 6149 spin_unlock_irq(&ctx->timeout_lock);
5262f567
JA
6150 return 0;
6151}
5262f567 6152
f458dd84
PB
6153struct io_cancel_data {
6154 struct io_ring_ctx *ctx;
6155 u64 user_data;
6156};
6157
62755e35
JA
6158static bool io_cancel_cb(struct io_wq_work *work, void *data)
6159{
6160 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 6161 struct io_cancel_data *cd = data;
62755e35 6162
f458dd84 6163 return req->ctx == cd->ctx && req->user_data == cd->user_data;
62755e35
JA
6164}
6165
f458dd84
PB
6166static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6167 struct io_ring_ctx *ctx)
62755e35 6168{
f458dd84 6169 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
62755e35 6170 enum io_wq_cancel cancel_ret;
62755e35
JA
6171 int ret = 0;
6172
f458dd84 6173 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
6174 return -ENOENT;
6175
f458dd84 6176 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
62755e35
JA
6177 switch (cancel_ret) {
6178 case IO_WQ_CANCEL_OK:
6179 ret = 0;
6180 break;
6181 case IO_WQ_CANCEL_RUNNING:
6182 ret = -EALREADY;
6183 break;
6184 case IO_WQ_CANCEL_NOTFOUND:
6185 ret = -ENOENT;
6186 break;
6187 }
6188
e977d6d3
JA
6189 return ret;
6190}
6191
8cb01fac 6192static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
47f46768 6193{
8cb01fac 6194 struct io_ring_ctx *ctx = req->ctx;
47f46768
JA
6195 int ret;
6196
dadebc35 6197 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
8cb01fac 6198
f458dd84 6199 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
df9727af 6200 if (ret != -ENOENT)
8cb01fac 6201 return ret;
505657bc
PB
6202
6203 spin_lock(&ctx->completion_lock);
79ebeaee 6204 spin_lock_irq(&ctx->timeout_lock);
47f46768 6205 ret = io_timeout_cancel(ctx, sqe_addr);
79ebeaee 6206 spin_unlock_irq(&ctx->timeout_lock);
47f46768 6207 if (ret != -ENOENT)
505657bc
PB
6208 goto out;
6209 ret = io_poll_cancel(ctx, sqe_addr, false);
6210out:
6211 spin_unlock(&ctx->completion_lock);
6212 return ret;
47f46768
JA
6213}
6214
3529d8c2
JA
6215static int io_async_cancel_prep(struct io_kiocb *req,
6216 const struct io_uring_sqe *sqe)
e977d6d3 6217{
fbf23849 6218 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 6219 return -EINVAL;
61710e43
DA
6220 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6221 return -EINVAL;
26578cda
PB
6222 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6223 sqe->splice_fd_in)
e977d6d3
JA
6224 return -EINVAL;
6225
fbf23849
JA
6226 req->cancel.addr = READ_ONCE(sqe->addr);
6227 return 0;
6228}
6229
61e98203 6230static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6231{
6232 struct io_ring_ctx *ctx = req->ctx;
58f99373
PB
6233 u64 sqe_addr = req->cancel.addr;
6234 struct io_tctx_node *node;
6235 int ret;
6236
8cb01fac 6237 ret = io_try_cancel_userdata(req, sqe_addr);
58f99373
PB
6238 if (ret != -ENOENT)
6239 goto done;
58f99373
PB
6240
6241 /* slow path, try all io-wq's */
6242 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6243 ret = -ENOENT;
6244 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6245 struct io_uring_task *tctx = node->task->io_uring;
fbf23849 6246
58f99373
PB
6247 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6248 if (ret != -ENOENT)
6249 break;
6250 }
6251 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
58f99373 6252done:
58f99373 6253 if (ret < 0)
93d2bcd2 6254 req_set_fail(req);
505657bc 6255 io_req_complete_post(req, ret, 0);
5262f567
JA
6256 return 0;
6257}
6258
269bbe5f 6259static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
6260 const struct io_uring_sqe *sqe)
6261{
61710e43
DA
6262 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6263 return -EINVAL;
26578cda 6264 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
05f3fb3c
JA
6265 return -EINVAL;
6266
269bbe5f
BM
6267 req->rsrc_update.offset = READ_ONCE(sqe->off);
6268 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6269 if (!req->rsrc_update.nr_args)
05f3fb3c 6270 return -EINVAL;
269bbe5f 6271 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
6272 return 0;
6273}
6274
889fca73 6275static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6276{
6277 struct io_ring_ctx *ctx = req->ctx;
c3bdad02 6278 struct io_uring_rsrc_update2 up;
05f3fb3c 6279 int ret;
fbf23849 6280
269bbe5f
BM
6281 up.offset = req->rsrc_update.offset;
6282 up.data = req->rsrc_update.arg;
c3bdad02
PB
6283 up.nr = 0;
6284 up.tags = 0;
615cee49 6285 up.resv = 0;
05f3fb3c 6286
cdb31c29 6287 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
fdecb662 6288 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
98f0b3b4 6289 &up, req->rsrc_update.nr_args);
cdb31c29 6290 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
05f3fb3c
JA
6291
6292 if (ret < 0)
93d2bcd2 6293 req_set_fail(req);
889fca73 6294 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
6295 return 0;
6296}
6297
bfe76559 6298static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 6299{
d625c6ee 6300 switch (req->opcode) {
e781573e 6301 case IORING_OP_NOP:
bfe76559 6302 return 0;
f67676d1
JA
6303 case IORING_OP_READV:
6304 case IORING_OP_READ_FIXED:
3a6820f2 6305 case IORING_OP_READ:
bfe76559 6306 return io_read_prep(req, sqe);
f67676d1
JA
6307 case IORING_OP_WRITEV:
6308 case IORING_OP_WRITE_FIXED:
3a6820f2 6309 case IORING_OP_WRITE:
bfe76559 6310 return io_write_prep(req, sqe);
0969e783 6311 case IORING_OP_POLL_ADD:
bfe76559 6312 return io_poll_add_prep(req, sqe);
0969e783 6313 case IORING_OP_POLL_REMOVE:
c5de0036 6314 return io_poll_update_prep(req, sqe);
8ed8d3c3 6315 case IORING_OP_FSYNC:
1155c76a 6316 return io_fsync_prep(req, sqe);
8ed8d3c3 6317 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 6318 return io_sfr_prep(req, sqe);
03b1230c 6319 case IORING_OP_SENDMSG:
fddaface 6320 case IORING_OP_SEND:
bfe76559 6321 return io_sendmsg_prep(req, sqe);
03b1230c 6322 case IORING_OP_RECVMSG:
fddaface 6323 case IORING_OP_RECV:
bfe76559 6324 return io_recvmsg_prep(req, sqe);
f499a021 6325 case IORING_OP_CONNECT:
bfe76559 6326 return io_connect_prep(req, sqe);
2d28390a 6327 case IORING_OP_TIMEOUT:
bfe76559 6328 return io_timeout_prep(req, sqe, false);
b29472ee 6329 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 6330 return io_timeout_remove_prep(req, sqe);
fbf23849 6331 case IORING_OP_ASYNC_CANCEL:
bfe76559 6332 return io_async_cancel_prep(req, sqe);
2d28390a 6333 case IORING_OP_LINK_TIMEOUT:
bfe76559 6334 return io_timeout_prep(req, sqe, true);
8ed8d3c3 6335 case IORING_OP_ACCEPT:
bfe76559 6336 return io_accept_prep(req, sqe);
d63d1b5e 6337 case IORING_OP_FALLOCATE:
bfe76559 6338 return io_fallocate_prep(req, sqe);
15b71abe 6339 case IORING_OP_OPENAT:
bfe76559 6340 return io_openat_prep(req, sqe);
b5dba59e 6341 case IORING_OP_CLOSE:
bfe76559 6342 return io_close_prep(req, sqe);
05f3fb3c 6343 case IORING_OP_FILES_UPDATE:
269bbe5f 6344 return io_rsrc_update_prep(req, sqe);
eddc7ef5 6345 case IORING_OP_STATX:
bfe76559 6346 return io_statx_prep(req, sqe);
4840e418 6347 case IORING_OP_FADVISE:
bfe76559 6348 return io_fadvise_prep(req, sqe);
c1ca757b 6349 case IORING_OP_MADVISE:
bfe76559 6350 return io_madvise_prep(req, sqe);
cebdb986 6351 case IORING_OP_OPENAT2:
bfe76559 6352 return io_openat2_prep(req, sqe);
3e4827b0 6353 case IORING_OP_EPOLL_CTL:
bfe76559 6354 return io_epoll_ctl_prep(req, sqe);
7d67af2c 6355 case IORING_OP_SPLICE:
bfe76559 6356 return io_splice_prep(req, sqe);
ddf0322d 6357 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 6358 return io_provide_buffers_prep(req, sqe);
067524e9 6359 case IORING_OP_REMOVE_BUFFERS:
bfe76559 6360 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 6361 case IORING_OP_TEE:
bfe76559 6362 return io_tee_prep(req, sqe);
36f4fa68
JA
6363 case IORING_OP_SHUTDOWN:
6364 return io_shutdown_prep(req, sqe);
80a261fd
JA
6365 case IORING_OP_RENAMEAT:
6366 return io_renameat_prep(req, sqe);
14a1143b
JA
6367 case IORING_OP_UNLINKAT:
6368 return io_unlinkat_prep(req, sqe);
e34a02dc
DK
6369 case IORING_OP_MKDIRAT:
6370 return io_mkdirat_prep(req, sqe);
7a8721f8
DK
6371 case IORING_OP_SYMLINKAT:
6372 return io_symlinkat_prep(req, sqe);
cf30da90
DK
6373 case IORING_OP_LINKAT:
6374 return io_linkat_prep(req, sqe);
f67676d1
JA
6375 }
6376
bfe76559
PB
6377 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6378 req->opcode);
bd54b6fe 6379 return -EINVAL;
bfe76559
PB
6380}
6381
93642ef8 6382static int io_req_prep_async(struct io_kiocb *req)
bfe76559 6383{
b7e298d2
PB
6384 if (!io_op_defs[req->opcode].needs_async_setup)
6385 return 0;
d886e185 6386 if (WARN_ON_ONCE(req_has_async_data(req)))
b7e298d2
PB
6387 return -EFAULT;
6388 if (io_alloc_async_data(req))
6389 return -EAGAIN;
6390
93642ef8
PB
6391 switch (req->opcode) {
6392 case IORING_OP_READV:
93642ef8
PB
6393 return io_rw_prep_async(req, READ);
6394 case IORING_OP_WRITEV:
93642ef8
PB
6395 return io_rw_prep_async(req, WRITE);
6396 case IORING_OP_SENDMSG:
93642ef8
PB
6397 return io_sendmsg_prep_async(req);
6398 case IORING_OP_RECVMSG:
93642ef8
PB
6399 return io_recvmsg_prep_async(req);
6400 case IORING_OP_CONNECT:
6401 return io_connect_prep_async(req);
6402 }
b7e298d2
PB
6403 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6404 req->opcode);
6405 return -EFAULT;
f67676d1
JA
6406}
6407
9cf7c104
PB
6408static u32 io_get_sequence(struct io_kiocb *req)
6409{
a3dbdf54 6410 u32 seq = req->ctx->cached_sq_head;
9cf7c104 6411
a3dbdf54
PB
6412 /* need original cached_sq_head, but it was increased for each req */
6413 io_for_each_link(req, req)
6414 seq--;
6415 return seq;
9cf7c104
PB
6416}
6417
c072481d 6418static __cold void io_drain_req(struct io_kiocb *req)
de0617e4 6419{
a197f664 6420 struct io_ring_ctx *ctx = req->ctx;
27dc8338 6421 struct io_defer_entry *de;
f67676d1 6422 int ret;
e0eb71dc 6423 u32 seq = io_get_sequence(req);
de0617e4 6424
9d858b21 6425 /* Still need defer if there is pending req in defer list. */
5e371265 6426 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
e0eb71dc 6427queue:
5e371265 6428 ctx->drain_active = false;
e0eb71dc
PB
6429 io_req_task_queue(req);
6430 return;
5e371265 6431 }
de0617e4 6432
b7e298d2 6433 ret = io_req_prep_async(req);
e0eb71dc
PB
6434 if (ret) {
6435fail:
6436 io_req_complete_failed(req, ret);
6437 return;
6438 }
cbdcb435 6439 io_prep_async_link(req);
27dc8338 6440 de = kmalloc(sizeof(*de), GFP_KERNEL);
76cc33d7 6441 if (!de) {
1b48773f 6442 ret = -ENOMEM;
e0eb71dc 6443 goto fail;
76cc33d7 6444 }
2d28390a 6445
79ebeaee 6446 spin_lock(&ctx->completion_lock);
9cf7c104 6447 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
79ebeaee 6448 spin_unlock(&ctx->completion_lock);
27dc8338 6449 kfree(de);
e0eb71dc 6450 goto queue;
de0617e4
JA
6451 }
6452
915967f6 6453 trace_io_uring_defer(ctx, req, req->user_data);
27dc8338 6454 de->req = req;
9cf7c104 6455 de->seq = seq;
27dc8338 6456 list_add_tail(&de->list, &ctx->defer_list);
79ebeaee 6457 spin_unlock(&ctx->completion_lock);
de0617e4
JA
6458}
6459
68fb8979 6460static void io_clean_op(struct io_kiocb *req)
99bc4c38 6461{
0e1b6fe3 6462 if (req->flags & REQ_F_BUFFER_SELECTED) {
30d51dd4
PB
6463 kfree(req->kbuf);
6464 req->kbuf = NULL;
99bc4c38
PB
6465 }
6466
0e1b6fe3
PB
6467 if (req->flags & REQ_F_NEED_CLEANUP) {
6468 switch (req->opcode) {
6469 case IORING_OP_READV:
6470 case IORING_OP_READ_FIXED:
6471 case IORING_OP_READ:
6472 case IORING_OP_WRITEV:
6473 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
6474 case IORING_OP_WRITE: {
6475 struct io_async_rw *io = req->async_data;
1dacb4df
PB
6476
6477 kfree(io->free_iovec);
0e1b6fe3 6478 break;
e8c2bc1f 6479 }
0e1b6fe3 6480 case IORING_OP_RECVMSG:
e8c2bc1f
JA
6481 case IORING_OP_SENDMSG: {
6482 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
6483
6484 kfree(io->free_iov);
0e1b6fe3 6485 break;
e8c2bc1f 6486 }
0e1b6fe3
PB
6487 case IORING_OP_SPLICE:
6488 case IORING_OP_TEE:
e1d767f0
PB
6489 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6490 io_put_file(req->splice.file_in);
0e1b6fe3 6491 break;
f3cd4850
JA
6492 case IORING_OP_OPENAT:
6493 case IORING_OP_OPENAT2:
6494 if (req->open.filename)
6495 putname(req->open.filename);
6496 break;
80a261fd
JA
6497 case IORING_OP_RENAMEAT:
6498 putname(req->rename.oldpath);
6499 putname(req->rename.newpath);
6500 break;
14a1143b
JA
6501 case IORING_OP_UNLINKAT:
6502 putname(req->unlink.filename);
6503 break;
e34a02dc
DK
6504 case IORING_OP_MKDIRAT:
6505 putname(req->mkdir.filename);
6506 break;
7a8721f8
DK
6507 case IORING_OP_SYMLINKAT:
6508 putname(req->symlink.oldpath);
6509 putname(req->symlink.newpath);
6510 break;
cf30da90
DK
6511 case IORING_OP_LINKAT:
6512 putname(req->hardlink.oldpath);
6513 putname(req->hardlink.newpath);
6514 break;
0e1b6fe3 6515 }
99bc4c38 6516 }
75652a30
JA
6517 if ((req->flags & REQ_F_POLLED) && req->apoll) {
6518 kfree(req->apoll->double_poll);
6519 kfree(req->apoll);
6520 req->apoll = NULL;
6521 }
3a0a6902
PB
6522 if (req->flags & REQ_F_INFLIGHT) {
6523 struct io_uring_task *tctx = req->task->io_uring;
6524
6525 atomic_dec(&tctx->inflight_tracked);
3a0a6902 6526 }
c854357b 6527 if (req->flags & REQ_F_CREDS)
b8e64b53 6528 put_cred(req->creds);
d886e185
PB
6529 if (req->flags & REQ_F_ASYNC_DATA) {
6530 kfree(req->async_data);
6531 req->async_data = NULL;
6532 }
c854357b 6533 req->flags &= ~IO_REQ_CLEAN_FLAGS;
99bc4c38
PB
6534}
6535
889fca73 6536static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 6537{
a197f664 6538 struct io_ring_ctx *ctx = req->ctx;
5730b27e 6539 const struct cred *creds = NULL;
d625c6ee 6540 int ret;
2b188cc1 6541
6878b40e 6542 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
c10d1f98 6543 creds = override_creds(req->creds);
5730b27e 6544
d625c6ee 6545 switch (req->opcode) {
2b188cc1 6546 case IORING_OP_NOP:
889fca73 6547 ret = io_nop(req, issue_flags);
2b188cc1
JA
6548 break;
6549 case IORING_OP_READV:
edafccee 6550 case IORING_OP_READ_FIXED:
3a6820f2 6551 case IORING_OP_READ:
889fca73 6552 ret = io_read(req, issue_flags);
edafccee 6553 break;
3529d8c2 6554 case IORING_OP_WRITEV:
edafccee 6555 case IORING_OP_WRITE_FIXED:
3a6820f2 6556 case IORING_OP_WRITE:
889fca73 6557 ret = io_write(req, issue_flags);
2b188cc1 6558 break;
c992fe29 6559 case IORING_OP_FSYNC:
45d189c6 6560 ret = io_fsync(req, issue_flags);
c992fe29 6561 break;
221c5eb2 6562 case IORING_OP_POLL_ADD:
61e98203 6563 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
6564 break;
6565 case IORING_OP_POLL_REMOVE:
c5de0036 6566 ret = io_poll_update(req, issue_flags);
221c5eb2 6567 break;
5d17b4a4 6568 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 6569 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 6570 break;
0fa03c62 6571 case IORING_OP_SENDMSG:
889fca73 6572 ret = io_sendmsg(req, issue_flags);
062d04d7 6573 break;
fddaface 6574 case IORING_OP_SEND:
889fca73 6575 ret = io_send(req, issue_flags);
0fa03c62 6576 break;
aa1fa28f 6577 case IORING_OP_RECVMSG:
889fca73 6578 ret = io_recvmsg(req, issue_flags);
062d04d7 6579 break;
fddaface 6580 case IORING_OP_RECV:
889fca73 6581 ret = io_recv(req, issue_flags);
aa1fa28f 6582 break;
5262f567 6583 case IORING_OP_TIMEOUT:
61e98203 6584 ret = io_timeout(req, issue_flags);
5262f567 6585 break;
11365043 6586 case IORING_OP_TIMEOUT_REMOVE:
61e98203 6587 ret = io_timeout_remove(req, issue_flags);
11365043 6588 break;
17f2fe35 6589 case IORING_OP_ACCEPT:
889fca73 6590 ret = io_accept(req, issue_flags);
17f2fe35 6591 break;
f8e85cf2 6592 case IORING_OP_CONNECT:
889fca73 6593 ret = io_connect(req, issue_flags);
f8e85cf2 6594 break;
62755e35 6595 case IORING_OP_ASYNC_CANCEL:
61e98203 6596 ret = io_async_cancel(req, issue_flags);
62755e35 6597 break;
d63d1b5e 6598 case IORING_OP_FALLOCATE:
45d189c6 6599 ret = io_fallocate(req, issue_flags);
d63d1b5e 6600 break;
15b71abe 6601 case IORING_OP_OPENAT:
45d189c6 6602 ret = io_openat(req, issue_flags);
15b71abe 6603 break;
b5dba59e 6604 case IORING_OP_CLOSE:
889fca73 6605 ret = io_close(req, issue_flags);
b5dba59e 6606 break;
05f3fb3c 6607 case IORING_OP_FILES_UPDATE:
889fca73 6608 ret = io_files_update(req, issue_flags);
05f3fb3c 6609 break;
eddc7ef5 6610 case IORING_OP_STATX:
45d189c6 6611 ret = io_statx(req, issue_flags);
eddc7ef5 6612 break;
4840e418 6613 case IORING_OP_FADVISE:
45d189c6 6614 ret = io_fadvise(req, issue_flags);
4840e418 6615 break;
c1ca757b 6616 case IORING_OP_MADVISE:
45d189c6 6617 ret = io_madvise(req, issue_flags);
c1ca757b 6618 break;
cebdb986 6619 case IORING_OP_OPENAT2:
45d189c6 6620 ret = io_openat2(req, issue_flags);
cebdb986 6621 break;
3e4827b0 6622 case IORING_OP_EPOLL_CTL:
889fca73 6623 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 6624 break;
7d67af2c 6625 case IORING_OP_SPLICE:
45d189c6 6626 ret = io_splice(req, issue_flags);
7d67af2c 6627 break;
ddf0322d 6628 case IORING_OP_PROVIDE_BUFFERS:
889fca73 6629 ret = io_provide_buffers(req, issue_flags);
ddf0322d 6630 break;
067524e9 6631 case IORING_OP_REMOVE_BUFFERS:
889fca73 6632 ret = io_remove_buffers(req, issue_flags);
3e4827b0 6633 break;
f2a8d5c7 6634 case IORING_OP_TEE:
45d189c6 6635 ret = io_tee(req, issue_flags);
f2a8d5c7 6636 break;
36f4fa68 6637 case IORING_OP_SHUTDOWN:
45d189c6 6638 ret = io_shutdown(req, issue_flags);
36f4fa68 6639 break;
80a261fd 6640 case IORING_OP_RENAMEAT:
45d189c6 6641 ret = io_renameat(req, issue_flags);
80a261fd 6642 break;
14a1143b 6643 case IORING_OP_UNLINKAT:
45d189c6 6644 ret = io_unlinkat(req, issue_flags);
14a1143b 6645 break;
e34a02dc
DK
6646 case IORING_OP_MKDIRAT:
6647 ret = io_mkdirat(req, issue_flags);
6648 break;
7a8721f8
DK
6649 case IORING_OP_SYMLINKAT:
6650 ret = io_symlinkat(req, issue_flags);
6651 break;
cf30da90
DK
6652 case IORING_OP_LINKAT:
6653 ret = io_linkat(req, issue_flags);
6654 break;
2b188cc1
JA
6655 default:
6656 ret = -EINVAL;
6657 break;
6658 }
6659
5730b27e
JA
6660 if (creds)
6661 revert_creds(creds);
def596e9
JA
6662 if (ret)
6663 return ret;
b532576e 6664 /* If the op doesn't have a file, we're not polling for it */
cb3d8972
PB
6665 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6666 io_iopoll_req_issued(req);
def596e9
JA
6667
6668 return 0;
2b188cc1
JA
6669}
6670
ebc11b6c
PB
6671static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6672{
6673 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6674
6675 req = io_put_req_find_next(req);
6676 return req ? &req->work : NULL;
6677}
6678
5280f7e5 6679static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
6680{
6681 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6df1db6b 6682 struct io_kiocb *timeout;
561fb04a 6683 int ret = 0;
2b188cc1 6684
48dcd38d
PB
6685 /* one will be dropped by ->io_free_work() after returning to io-wq */
6686 if (!(req->flags & REQ_F_REFCOUNT))
6687 __io_req_set_refcount(req, 2);
6688 else
6689 req_ref_get(req);
5d5901a3 6690
6df1db6b
PB
6691 timeout = io_prep_linked_timeout(req);
6692 if (timeout)
6693 io_queue_linked_timeout(timeout);
d4c81f38 6694
dadebc35 6695 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
4014d943 6696 if (work->flags & IO_WQ_WORK_CANCEL)
561fb04a 6697 ret = -ECANCELED;
31b51510 6698
561fb04a 6699 if (!ret) {
561fb04a 6700 do {
889fca73 6701 ret = io_issue_sqe(req, 0);
561fb04a
JA
6702 /*
6703 * We can get EAGAIN for polled IO even though we're
6704 * forcing a sync submission from here, since we can't
6705 * wait for request slots on the block side.
6706 */
6707 if (ret != -EAGAIN)
6708 break;
6709 cond_resched();
6710 } while (1);
6711 }
31b51510 6712
a3df7698 6713 /* avoid locking problems by failing it from a clean context */
5d5901a3 6714 if (ret)
a3df7698 6715 io_req_task_queue_fail(req, ret);
2b188cc1
JA
6716}
6717
aeca241b 6718static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
042b0d85 6719 unsigned i)
65e19f54 6720{
042b0d85 6721 return &table->files[i];
dafecf19
PB
6722}
6723
65e19f54
JA
6724static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6725 int index)
6726{
aeca241b 6727 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
65e19f54 6728
a04b0ac0 6729 return (struct file *) (slot->file_ptr & FFS_MASK);
65e19f54
JA
6730}
6731
a04b0ac0 6732static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
9a321c98
PB
6733{
6734 unsigned long file_ptr = (unsigned long) file;
6735
b191e2df 6736 if (__io_file_supports_nowait(file, READ))
9a321c98 6737 file_ptr |= FFS_ASYNC_READ;
b191e2df 6738 if (__io_file_supports_nowait(file, WRITE))
9a321c98
PB
6739 file_ptr |= FFS_ASYNC_WRITE;
6740 if (S_ISREG(file_inode(file)->i_mode))
6741 file_ptr |= FFS_ISREG;
a04b0ac0 6742 file_slot->file_ptr = file_ptr;
65e19f54
JA
6743}
6744
ac177053
PB
6745static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6746 struct io_kiocb *req, int fd)
09bb8394 6747{
8da11c19 6748 struct file *file;
ac177053 6749 unsigned long file_ptr;
09bb8394 6750
ac177053
PB
6751 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6752 return NULL;
6753 fd = array_index_nospec(fd, ctx->nr_user_files);
6754 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6755 file = (struct file *) (file_ptr & FFS_MASK);
6756 file_ptr &= ~FFS_MASK;
6757 /* mask in overlapping REQ_F and FFS bits */
b191e2df 6758 req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
ac177053
PB
6759 io_req_set_rsrc_node(req);
6760 return file;
6761}
d44f554e 6762
ac177053 6763static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
ac177053
PB
6764 struct io_kiocb *req, int fd)
6765{
62906e89 6766 struct file *file = fget(fd);
ac177053
PB
6767
6768 trace_io_uring_file_get(ctx, fd);
09bb8394 6769
ac177053
PB
6770 /* we don't allow fixed io_uring files */
6771 if (file && unlikely(file->f_op == &io_uring_fops))
6772 io_req_track_inflight(req);
8371adf5 6773 return file;
09bb8394
JA
6774}
6775
ac177053 6776static inline struct file *io_file_get(struct io_ring_ctx *ctx,
ac177053
PB
6777 struct io_kiocb *req, int fd, bool fixed)
6778{
6779 if (fixed)
6780 return io_file_get_fixed(ctx, req, fd);
6781 else
62906e89 6782 return io_file_get_normal(ctx, req, fd);
ac177053
PB
6783}
6784
f237c30a 6785static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
89b263f6
JA
6786{
6787 struct io_kiocb *prev = req->timeout.prev;
8cb01fac 6788 int ret;
89b263f6
JA
6789
6790 if (prev) {
8cb01fac 6791 ret = io_try_cancel_userdata(req, prev->user_data);
505657bc 6792 io_req_complete_post(req, ret ?: -ETIME, 0);
89b263f6 6793 io_put_req(prev);
89b263f6
JA
6794 } else {
6795 io_req_complete_post(req, -ETIME, 0);
6796 }
6797}
6798
2665abfd 6799static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 6800{
ad8a48ac
JA
6801 struct io_timeout_data *data = container_of(timer,
6802 struct io_timeout_data, timer);
90cd7e42 6803 struct io_kiocb *prev, *req = data->req;
2665abfd 6804 struct io_ring_ctx *ctx = req->ctx;
2665abfd 6805 unsigned long flags;
2665abfd 6806
89b263f6 6807 spin_lock_irqsave(&ctx->timeout_lock, flags);
90cd7e42
PB
6808 prev = req->timeout.head;
6809 req->timeout.head = NULL;
2665abfd
JA
6810
6811 /*
6812 * We don't expect the list to be empty, that will only happen if we
6813 * race with the completion of the linked work.
6814 */
447c19f3 6815 if (prev) {
f2f87370 6816 io_remove_next_linked(prev);
447c19f3
PB
6817 if (!req_ref_inc_not_zero(prev))
6818 prev = NULL;
6819 }
ef9dd637 6820 list_del(&req->timeout.list);
89b263f6
JA
6821 req->timeout.prev = prev;
6822 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
2665abfd 6823
89b263f6
JA
6824 req->io_task_work.func = io_req_task_link_timeout;
6825 io_req_task_work_add(req);
2665abfd
JA
6826 return HRTIMER_NORESTART;
6827}
6828
de968c18 6829static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 6830{
de968c18
PB
6831 struct io_ring_ctx *ctx = req->ctx;
6832
89b263f6 6833 spin_lock_irq(&ctx->timeout_lock);
76a46e06 6834 /*
f2f87370
PB
6835 * If the back reference is NULL, then our linked request finished
6836 * before we got a chance to setup the timer
76a46e06 6837 */
90cd7e42 6838 if (req->timeout.head) {
e8c2bc1f 6839 struct io_timeout_data *data = req->async_data;
94ae5e77 6840
ad8a48ac
JA
6841 data->timer.function = io_link_timeout_fn;
6842 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6843 data->mode);
ef9dd637 6844 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
2665abfd 6845 }
89b263f6 6846 spin_unlock_irq(&ctx->timeout_lock);
2665abfd 6847 /* drop submission reference */
76a46e06
JA
6848 io_put_req(req);
6849}
2665abfd 6850
d475a9a6
PB
6851static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
6852 __must_hold(&req->ctx->uring_lock)
6853{
6854 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6855
6856 switch (io_arm_poll_handler(req)) {
6857 case IO_APOLL_READY:
6858 if (linked_timeout) {
6859 io_unprep_linked_timeout(req);
6860 linked_timeout = NULL;
6861 }
6862 io_req_task_queue(req);
6863 break;
6864 case IO_APOLL_ABORTED:
6865 /*
6866 * Queued up for async execution, worker will release
6867 * submit reference when the iocb is actually submitted.
6868 */
6869 io_queue_async_work(req, NULL);
6870 break;
6871 }
6872
6873 if (linked_timeout)
6874 io_queue_linked_timeout(linked_timeout);
6875}
6876
6877static inline void __io_queue_sqe(struct io_kiocb *req)
282cdc86 6878 __must_hold(&req->ctx->uring_lock)
2b188cc1 6879{
906c6caa 6880 struct io_kiocb *linked_timeout;
e0c5c576 6881 int ret;
2b188cc1 6882
c5eef2b9 6883 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 6884
fff4e40e
PB
6885 if (req->flags & REQ_F_COMPLETE_INLINE) {
6886 io_req_add_compl_list(req);
d9f9d284 6887 return;
fff4e40e 6888 }
491381ce
JA
6889 /*
6890 * We async punt it if the file wasn't marked NOWAIT, or if the file
6891 * doesn't support non-blocking read/write attempts
6892 */
1840038e 6893 if (likely(!ret)) {
906c6caa
PB
6894 linked_timeout = io_prep_linked_timeout(req);
6895 if (linked_timeout)
6896 io_queue_linked_timeout(linked_timeout);
1840038e 6897 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
d475a9a6 6898 io_queue_sqe_arm_apoll(req);
0d63c148 6899 } else {
f41db273 6900 io_req_complete_failed(req, ret);
9e645e11 6901 }
2b188cc1
JA
6902}
6903
4652fe3f 6904static void io_queue_sqe_fallback(struct io_kiocb *req)
282cdc86 6905 __must_hold(&req->ctx->uring_lock)
4fe2c963 6906{
4652fe3f 6907 if (req->flags & REQ_F_FAIL) {
c6d3d9cb 6908 io_req_complete_fail_submit(req);
e0eb71dc
PB
6909 } else if (unlikely(req->ctx->drain_active)) {
6910 io_drain_req(req);
76cc33d7
PB
6911 } else {
6912 int ret = io_req_prep_async(req);
6913
6914 if (unlikely(ret))
6915 io_req_complete_failed(req, ret);
6916 else
f237c30a 6917 io_queue_async_work(req, NULL);
ce35a47a 6918 }
4fe2c963
JL
6919}
6920
4652fe3f
PB
6921static inline void io_queue_sqe(struct io_kiocb *req)
6922 __must_hold(&req->ctx->uring_lock)
6923{
6924 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
6925 __io_queue_sqe(req);
6926 else
6927 io_queue_sqe_fallback(req);
6928}
6929
b16fed66
PB
6930/*
6931 * Check SQE restrictions (opcode and flags).
6932 *
6933 * Returns 'true' if SQE is allowed, 'false' otherwise.
6934 */
6935static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6936 struct io_kiocb *req,
6937 unsigned int sqe_flags)
4fe2c963 6938{
b16fed66
PB
6939 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6940 return false;
6941
6942 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6943 ctx->restrictions.sqe_flags_required)
6944 return false;
6945
6946 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6947 ctx->restrictions.sqe_flags_required))
6948 return false;
6949
6950 return true;
4fe2c963
JL
6951}
6952
22b2ca31
PB
6953static void io_init_req_drain(struct io_kiocb *req)
6954{
6955 struct io_ring_ctx *ctx = req->ctx;
6956 struct io_kiocb *head = ctx->submit_state.link.head;
6957
6958 ctx->drain_active = true;
6959 if (head) {
6960 /*
6961 * If we need to drain a request in the middle of a link, drain
6962 * the head request and the next request/link after the current
6963 * link. Considering sequential execution of links,
6964 * IOSQE_IO_DRAIN will be maintained for every request of our
6965 * link.
6966 */
6967 head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
6968 ctx->drain_next = true;
6969 }
6970}
6971
b16fed66
PB
6972static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6973 const struct io_uring_sqe *sqe)
282cdc86 6974 __must_hold(&ctx->uring_lock)
b16fed66 6975{
b16fed66 6976 unsigned int sqe_flags;
fc0ae024 6977 int personality;
4a04d1d1 6978 u8 opcode;
b16fed66 6979
864ea921 6980 /* req is partially pre-initialised, see io_preinit_req() */
4a04d1d1 6981 req->opcode = opcode = READ_ONCE(sqe->opcode);
b16fed66
PB
6982 /* same numerical values with corresponding REQ_F_*, safe to copy */
6983 req->flags = sqe_flags = READ_ONCE(sqe->flags);
6984 req->user_data = READ_ONCE(sqe->user_data);
b16fed66 6985 req->file = NULL;
b16fed66 6986 req->fixed_rsrc_refs = NULL;
b16fed66 6987 req->task = current;
b16fed66 6988
4a04d1d1
PB
6989 if (unlikely(opcode >= IORING_OP_LAST)) {
6990 req->opcode = 0;
b16fed66 6991 return -EINVAL;
4a04d1d1 6992 }
68fe256a
PB
6993 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
6994 /* enforce forwards compatibility on users */
6995 if (sqe_flags & ~SQE_VALID_FLAGS)
6996 return -EINVAL;
6997 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
4a04d1d1 6998 !io_op_defs[opcode].buffer_select)
68fe256a 6999 return -EOPNOTSUPP;
22b2ca31
PB
7000 if (sqe_flags & IOSQE_IO_DRAIN)
7001 io_init_req_drain(req);
2a56a9bd
PB
7002 }
7003 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7004 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7005 return -EACCES;
7006 /* knock it to the slow queue path, will be drained there */
7007 if (ctx->drain_active)
7008 req->flags |= REQ_F_FORCE_ASYNC;
7009 /* if there is no link, we're at "next" request and need to drain */
7010 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7011 ctx->drain_next = false;
7012 ctx->drain_active = true;
22b2ca31 7013 req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
2a56a9bd 7014 }
68fe256a 7015 }
b16fed66 7016
4a04d1d1 7017 if (io_op_defs[opcode].needs_file) {
6d63416d
PB
7018 struct io_submit_state *state = &ctx->submit_state;
7019
7020 /*
7021 * Plug now if we have more than 2 IO left after this, and the
7022 * target is potentially a read/write to block based storage.
7023 */
4a04d1d1 7024 if (state->need_plug && io_op_defs[opcode].plug) {
6d63416d
PB
7025 state->plug_started = true;
7026 state->need_plug = false;
7027 blk_start_plug(&state->plug);
7028 }
7029
62906e89 7030 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
ac177053 7031 (sqe_flags & IOSQE_FIXED_FILE));
b16fed66 7032 if (unlikely(!req->file))
fc0ae024 7033 return -EBADF;
b16fed66 7034 }
fc0ae024 7035
4a04d1d1
PB
7036 personality = READ_ONCE(sqe->personality);
7037 if (personality) {
7038 req->creds = xa_load(&ctx->personalities, personality);
7039 if (!req->creds)
7040 return -EINVAL;
7041 get_cred(req->creds);
7042 req->flags |= REQ_F_CREDS;
7043 }
7044
fc0ae024 7045 return io_req_prep(req, sqe);
b16fed66
PB
7046}
7047
a6b8cadc 7048static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 7049 const struct io_uring_sqe *sqe)
282cdc86 7050 __must_hold(&ctx->uring_lock)
9e645e11 7051{
a1ab7b35 7052 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 7053 int ret;
9e645e11 7054
a6b8cadc
PB
7055 ret = io_init_req(ctx, req, sqe);
7056 if (unlikely(ret)) {
a87acfde
JA
7057 trace_io_uring_req_failed(sqe, ret);
7058
a8295b98 7059 /* fail even hard links since we don't submit */
de59bc10 7060 if (link->head) {
a8295b98
HX
7061 /*
7062 * we can judge a link req is failed or cancelled by if
7063 * REQ_F_FAIL is set, but the head is an exception since
7064 * it may be set REQ_F_FAIL because of other req's failure
7065 * so let's leverage req->result to distinguish if a head
7066 * is set REQ_F_FAIL because of its failure or other req's
7067 * failure so that we can set the correct ret code for it.
7068 * init result here to avoid affecting the normal path.
7069 */
7070 if (!(link->head->flags & REQ_F_FAIL))
7071 req_fail_link_node(link->head, -ECANCELED);
7072 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7073 /*
7074 * the current req is a normal req, we should return
7075 * error and thus break the submittion loop.
7076 */
7077 io_req_complete_failed(req, ret);
7078 return ret;
de59bc10 7079 }
a8295b98 7080 req_fail_link_node(req, ret);
a6b8cadc 7081 }
441b8a78 7082
be7053b7 7083 /* don't need @sqe from now on */
236daeae
OL
7084 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
7085 req->flags, true,
7086 ctx->flags & IORING_SETUP_SQPOLL);
a6b8cadc 7087
9e645e11
JA
7088 /*
7089 * If we already have a head request, queue this one for async
7090 * submittal once the head completes. If we don't have a head but
7091 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7092 * submitted sync once the chain is complete. If none of those
7093 * conditions are true (normal request), then just queue it.
7094 */
863e0560
PB
7095 if (link->head) {
7096 struct io_kiocb *head = link->head;
4e88d6e7 7097
a8295b98
HX
7098 if (!(req->flags & REQ_F_FAIL)) {
7099 ret = io_req_prep_async(req);
7100 if (unlikely(ret)) {
7101 req_fail_link_node(req, ret);
7102 if (!(head->flags & REQ_F_FAIL))
7103 req_fail_link_node(head, -ECANCELED);
7104 }
7105 }
9d76377f 7106 trace_io_uring_link(ctx, req, head);
f2f87370 7107 link->last->link = req;
863e0560 7108 link->last = req;
32fe525b 7109
f15a3431
PB
7110 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7111 return 0;
32fe525b 7112 /* last request of a link, enqueue the link */
f15a3431
PB
7113 link->head = NULL;
7114 req = head;
7115 } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7116 link->head = req;
7117 link->last = req;
7118 return 0;
9e645e11 7119 }
2e6e1fde 7120
f15a3431 7121 io_queue_sqe(req);
1d4240cc 7122 return 0;
9e645e11
JA
7123}
7124
9a56a232
JA
7125/*
7126 * Batched submission is done, ensure local IO is flushed out.
7127 */
553deffd 7128static void io_submit_state_end(struct io_ring_ctx *ctx)
9a56a232 7129{
553deffd
PB
7130 struct io_submit_state *state = &ctx->submit_state;
7131
a1ab7b35 7132 if (state->link.head)
de59bc10 7133 io_queue_sqe(state->link.head);
553deffd 7134 /* flush only after queuing links as they can generate completions */
c450178d 7135 io_submit_flush_completions(ctx);
27926b68
JA
7136 if (state->plug_started)
7137 blk_finish_plug(&state->plug);
9a56a232
JA
7138}
7139
7140/*
7141 * Start submission side cache.
7142 */
7143static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 7144 unsigned int max_ios)
9a56a232 7145{
27926b68 7146 state->plug_started = false;
4b628aeb 7147 state->need_plug = max_ios > 2;
a1ab7b35
PB
7148 /* set only head, no need to init link_last in advance */
7149 state->link.head = NULL;
9a56a232
JA
7150}
7151
2b188cc1
JA
7152static void io_commit_sqring(struct io_ring_ctx *ctx)
7153{
75b28aff 7154 struct io_rings *rings = ctx->rings;
2b188cc1 7155
caf582c6
PB
7156 /*
7157 * Ensure any loads from the SQEs are done at this point,
7158 * since once we write the new head, the application could
7159 * write new data to them.
7160 */
7161 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
7162}
7163
2b188cc1 7164/*
dd9ae8a0 7165 * Fetch an sqe, if one is available. Note this returns a pointer to memory
2b188cc1
JA
7166 * that is mapped by userspace. This means that care needs to be taken to
7167 * ensure that reads are stable, as we cannot rely on userspace always
7168 * being a good citizen. If members of the sqe are validated and then later
7169 * used, it's important that those reads are done through READ_ONCE() to
7170 * prevent a re-load down the line.
7171 */
709b302f 7172static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 7173{
ea5ab3b5 7174 unsigned head, mask = ctx->sq_entries - 1;
17d3aeb3 7175 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2b188cc1
JA
7176
7177 /*
7178 * The cached sq head (or cq tail) serves two purposes:
7179 *
7180 * 1) allows us to batch the cost of updating the user visible
7181 * head updates.
7182 * 2) allows the kernel side to track the head on its own, even
7183 * though the application is the one updating it.
7184 */
17d3aeb3 7185 head = READ_ONCE(ctx->sq_array[sq_idx]);
709b302f
PB
7186 if (likely(head < ctx->sq_entries))
7187 return &ctx->sq_sqes[head];
2b188cc1
JA
7188
7189 /* drop invalid entries */
15641e42
PB
7190 ctx->cq_extra--;
7191 WRITE_ONCE(ctx->rings->sq_dropped,
7192 READ_ONCE(ctx->rings->sq_dropped) + 1);
709b302f
PB
7193 return NULL;
7194}
7195
0f212204 7196static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
282cdc86 7197 __must_hold(&ctx->uring_lock)
6c271ce2 7198{
69629809 7199 unsigned int entries = io_sqring_entries(ctx);
46c4e16a 7200 int submitted = 0;
6c271ce2 7201
51d48dab 7202 if (unlikely(!entries))
69629809 7203 return 0;
ee7d46d9 7204 /* make sure SQ entry isn't read before tail */
69629809 7205 nr = min3(nr, ctx->sq_entries, entries);
9a10867a 7206 io_get_task_refs(nr);
6c271ce2 7207
ba88ff11 7208 io_submit_state_start(&ctx->submit_state, nr);
69629809 7209 do {
3529d8c2 7210 const struct io_uring_sqe *sqe;
196be95c 7211 struct io_kiocb *req;
fb5ccc98 7212
a33ae9ce 7213 if (unlikely(!io_alloc_req_refill(ctx))) {
196be95c
PB
7214 if (!submitted)
7215 submitted = -EAGAIN;
fb5ccc98 7216 break;
196be95c 7217 }
a33ae9ce 7218 req = io_alloc_req(ctx);
4fccfcbb
PB
7219 sqe = io_get_sqe(ctx);
7220 if (unlikely(!sqe)) {
c2b6c6bc 7221 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
4fccfcbb
PB
7222 break;
7223 }
d3656344
JA
7224 /* will complete beyond this point, count as submitted */
7225 submitted++;
a1ab7b35 7226 if (io_submit_sqe(ctx, req, sqe))
196be95c 7227 break;
69629809 7228 } while (submitted < nr);
6c271ce2 7229
9466f437
PB
7230 if (unlikely(submitted != nr)) {
7231 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
d8a6df10 7232 int unused = nr - ref_used;
9466f437 7233
09899b19 7234 current->io_uring->cached_refs += unused;
9466f437 7235 }
6c271ce2 7236
553deffd 7237 io_submit_state_end(ctx);
ae9428ca
PB
7238 /* Commit SQ ring head once we've consumed and submitted all SQEs */
7239 io_commit_sqring(ctx);
7240
6c271ce2
JA
7241 return submitted;
7242}
7243
e4b6d902
PB
7244static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7245{
7246 return READ_ONCE(sqd->state);
7247}
7248
23b3628e
XW
7249static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7250{
7251 /* Tell userspace we may need a wakeup call */
79ebeaee 7252 spin_lock(&ctx->completion_lock);
20c0b380
NA
7253 WRITE_ONCE(ctx->rings->sq_flags,
7254 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
79ebeaee 7255 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7256}
7257
7258static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7259{
79ebeaee 7260 spin_lock(&ctx->completion_lock);
20c0b380
NA
7261 WRITE_ONCE(ctx->rings->sq_flags,
7262 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
79ebeaee 7263 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7264}
7265
08369246 7266static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 7267{
c8d1ba58 7268 unsigned int to_submit;
bdcd3eab 7269 int ret = 0;
6c271ce2 7270
c8d1ba58 7271 to_submit = io_sqring_entries(ctx);
e95eee2d 7272 /* if we're handling multiple rings, cap submit size for fairness */
4ce8ad95
OL
7273 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7274 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
e95eee2d 7275
5eef4e87 7276 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
948e1947
PB
7277 const struct cred *creds = NULL;
7278
7279 if (ctx->sq_creds != current_cred())
7280 creds = override_creds(ctx->sq_creds);
a4c0b3de 7281
c8d1ba58 7282 mutex_lock(&ctx->uring_lock);
5eef4e87 7283 if (!wq_list_empty(&ctx->iopoll_list))
5ba3c874 7284 io_do_iopoll(ctx, true);
906a3c6f 7285
3b763ba1
PB
7286 /*
7287 * Don't submit if refs are dying, good for io_uring_register(),
7288 * but also it is relied upon by io_ring_exit_work()
7289 */
0298ef96
PB
7290 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7291 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 7292 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58 7293 mutex_unlock(&ctx->uring_lock);
6c271ce2 7294
acfb381d
PB
7295 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7296 wake_up(&ctx->sqo_sq_wait);
948e1947
PB
7297 if (creds)
7298 revert_creds(creds);
acfb381d 7299 }
6c271ce2 7300
08369246
XW
7301 return ret;
7302}
6c271ce2 7303
c072481d 7304static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
08369246
XW
7305{
7306 struct io_ring_ctx *ctx;
7307 unsigned sq_thread_idle = 0;
6c271ce2 7308
c9dca27d
PB
7309 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7310 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
08369246 7311 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 7312}
6c271ce2 7313
e4b6d902
PB
7314static bool io_sqd_handle_event(struct io_sq_data *sqd)
7315{
7316 bool did_sig = false;
7317 struct ksignal ksig;
7318
7319 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7320 signal_pending(current)) {
7321 mutex_unlock(&sqd->lock);
7322 if (signal_pending(current))
7323 did_sig = get_signal(&ksig);
7324 cond_resched();
7325 mutex_lock(&sqd->lock);
7326 }
e4b6d902
PB
7327 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7328}
7329
c8d1ba58
JA
7330static int io_sq_thread(void *data)
7331{
69fb2131
JA
7332 struct io_sq_data *sqd = data;
7333 struct io_ring_ctx *ctx;
a0d9205f 7334 unsigned long timeout = 0;
37d1e2e3 7335 char buf[TASK_COMM_LEN];
08369246 7336 DEFINE_WAIT(wait);
6c271ce2 7337
696ee88a 7338 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
37d1e2e3 7339 set_task_comm(current, buf);
37d1e2e3
JA
7340
7341 if (sqd->sq_cpu != -1)
7342 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7343 else
7344 set_cpus_allowed_ptr(current, cpu_online_mask);
7345 current->flags |= PF_NO_SETAFFINITY;
7346
09a6f4ef 7347 mutex_lock(&sqd->lock);
e4b6d902 7348 while (1) {
1a924a80 7349 bool cap_entries, sqt_spin = false;
c1edbf5f 7350
e4b6d902
PB
7351 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7352 if (io_sqd_handle_event(sqd))
c7d95613 7353 break;
08369246
XW
7354 timeout = jiffies + sqd->sq_thread_idle;
7355 }
e4b6d902 7356
e95eee2d 7357 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 7358 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
948e1947 7359 int ret = __io_sq_thread(ctx, cap_entries);
7c30f36a 7360
5eef4e87 7361 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
08369246 7362 sqt_spin = true;
69fb2131 7363 }
dd432ea5
PB
7364 if (io_run_task_work())
7365 sqt_spin = true;
6c271ce2 7366
08369246 7367 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 7368 cond_resched();
08369246
XW
7369 if (sqt_spin)
7370 timeout = jiffies + sqd->sq_thread_idle;
7371 continue;
7372 }
7373
08369246 7374 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
dd432ea5 7375 if (!io_sqd_events_pending(sqd) && !current->task_works) {
1a924a80
PB
7376 bool needs_sched = true;
7377
724cb4f9 7378 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
aaa9f0f4
PB
7379 io_ring_set_wakeup_flag(ctx);
7380
724cb4f9 7381 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5eef4e87 7382 !wq_list_empty(&ctx->iopoll_list)) {
724cb4f9
HX
7383 needs_sched = false;
7384 break;
7385 }
7386 if (io_sqring_entries(ctx)) {
7387 needs_sched = false;
7388 break;
7389 }
7390 }
7391
7392 if (needs_sched) {
7393 mutex_unlock(&sqd->lock);
7394 schedule();
7395 mutex_lock(&sqd->lock);
7396 }
69fb2131
JA
7397 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7398 io_ring_clear_wakeup_flag(ctx);
6c271ce2 7399 }
08369246
XW
7400
7401 finish_wait(&sqd->wait, &wait);
7402 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 7403 }
28cea78a 7404
78cc687b 7405 io_uring_cancel_generic(true, sqd);
37d1e2e3 7406 sqd->thread = NULL;
05962f95 7407 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
5f3f26f9 7408 io_ring_set_wakeup_flag(ctx);
521d6a73 7409 io_run_task_work();
734551df
PB
7410 mutex_unlock(&sqd->lock);
7411
37d1e2e3
JA
7412 complete(&sqd->exited);
7413 do_exit(0);
6c271ce2
JA
7414}
7415
bda52162
JA
7416struct io_wait_queue {
7417 struct wait_queue_entry wq;
7418 struct io_ring_ctx *ctx;
5fd46178 7419 unsigned cq_tail;
bda52162
JA
7420 unsigned nr_timeouts;
7421};
7422
6c503150 7423static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
7424{
7425 struct io_ring_ctx *ctx = iowq->ctx;
5fd46178 7426 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
bda52162
JA
7427
7428 /*
d195a66e 7429 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
7430 * started waiting. For timeouts, we always want to return to userspace,
7431 * regardless of event count.
7432 */
5fd46178 7433 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
bda52162
JA
7434}
7435
7436static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7437 int wake_flags, void *key)
7438{
7439 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7440 wq);
7441
6c503150
PB
7442 /*
7443 * Cannot safely flush overflowed CQEs from here, ensure we wake up
7444 * the task, and the next invocation will do it.
7445 */
5ed7a37d 7446 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
6c503150
PB
7447 return autoremove_wake_function(curr, mode, wake_flags, key);
7448 return -1;
bda52162
JA
7449}
7450
af9c1a44
JA
7451static int io_run_task_work_sig(void)
7452{
7453 if (io_run_task_work())
7454 return 1;
7455 if (!signal_pending(current))
7456 return 0;
0b8cfa97 7457 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
792ee0f6 7458 return -ERESTARTSYS;
af9c1a44
JA
7459 return -EINTR;
7460}
7461
eeb60b9a
PB
7462/* when returns >0, the caller should retry */
7463static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7464 struct io_wait_queue *iowq,
7465 signed long *timeout)
7466{
7467 int ret;
7468
7469 /* make sure we run task_work before checking for signals */
7470 ret = io_run_task_work_sig();
7471 if (ret || io_should_wake(iowq))
7472 return ret;
7473 /* let the caller flush overflows, retry */
5ed7a37d 7474 if (test_bit(0, &ctx->check_cq_overflow))
eeb60b9a
PB
7475 return 1;
7476
7477 *timeout = schedule_timeout(*timeout);
7478 return !*timeout ? -ETIME : 1;
7479}
7480
2b188cc1
JA
7481/*
7482 * Wait until events become available, if we don't already have some. The
7483 * application must reap them itself, as they reside on the shared cq ring.
7484 */
7485static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
7486 const sigset_t __user *sig, size_t sigsz,
7487 struct __kernel_timespec __user *uts)
2b188cc1 7488{
90291099 7489 struct io_wait_queue iowq;
75b28aff 7490 struct io_rings *rings = ctx->rings;
c1d5a224
PB
7491 signed long timeout = MAX_SCHEDULE_TIMEOUT;
7492 int ret;
2b188cc1 7493
b41e9852 7494 do {
90f67366 7495 io_cqring_overflow_flush(ctx);
6c503150 7496 if (io_cqring_events(ctx) >= min_events)
b41e9852 7497 return 0;
4c6e277c 7498 if (!io_run_task_work())
b41e9852 7499 break;
b41e9852 7500 } while (1);
2b188cc1 7501
44df58d4
XW
7502 if (uts) {
7503 struct timespec64 ts;
7504
7505 if (get_timespec64(&ts, uts))
7506 return -EFAULT;
7507 timeout = timespec64_to_jiffies(&ts);
7508 }
7509
2b188cc1 7510 if (sig) {
9e75ad5d
AB
7511#ifdef CONFIG_COMPAT
7512 if (in_compat_syscall())
7513 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 7514 sigsz);
9e75ad5d
AB
7515 else
7516#endif
b772434b 7517 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 7518
2b188cc1
JA
7519 if (ret)
7520 return ret;
7521 }
7522
90291099
PB
7523 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7524 iowq.wq.private = current;
7525 INIT_LIST_HEAD(&iowq.wq.entry);
7526 iowq.ctx = ctx;
bda52162 7527 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
5fd46178 7528 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
90291099 7529
c826bd7a 7530 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 7531 do {
ca0a2651 7532 /* if we can't even flush overflow, don't wait for more */
90f67366 7533 if (!io_cqring_overflow_flush(ctx)) {
ca0a2651
JA
7534 ret = -EBUSY;
7535 break;
7536 }
311997b3 7537 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
bda52162 7538 TASK_INTERRUPTIBLE);
eeb60b9a 7539 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
311997b3 7540 finish_wait(&ctx->cq_wait, &iowq.wq);
ca0a2651 7541 cond_resched();
eeb60b9a 7542 } while (ret > 0);
bda52162 7543
b7db41c9 7544 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 7545
75b28aff 7546 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
7547}
7548
9123c8ff 7549static void io_free_page_table(void **table, size_t size)
05f3fb3c 7550{
9123c8ff 7551 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
05f3fb3c 7552
846a4ef2 7553 for (i = 0; i < nr_tables; i++)
9123c8ff
PB
7554 kfree(table[i]);
7555 kfree(table);
7556}
7557
c072481d 7558static __cold void **io_alloc_page_table(size_t size)
9123c8ff
PB
7559{
7560 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7561 size_t init_size = size;
7562 void **table;
7563
0bea96f5 7564 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
9123c8ff
PB
7565 if (!table)
7566 return NULL;
7567
7568 for (i = 0; i < nr_tables; i++) {
27f6b318 7569 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
9123c8ff 7570
0bea96f5 7571 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
9123c8ff
PB
7572 if (!table[i]) {
7573 io_free_page_table(table, init_size);
7574 return NULL;
7575 }
7576 size -= this_size;
7577 }
7578 return table;
05f3fb3c
JA
7579}
7580
28a9fe25 7581static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
1642b445 7582{
28a9fe25
PB
7583 percpu_ref_exit(&ref_node->refs);
7584 kfree(ref_node);
1642b445
PB
7585}
7586
c072481d 7587static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
b9bd2bea
PB
7588{
7589 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7590 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7591 unsigned long flags;
7592 bool first_add = false;
7593
7594 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7595 node->done = true;
7596
7597 while (!list_empty(&ctx->rsrc_ref_list)) {
7598 node = list_first_entry(&ctx->rsrc_ref_list,
7599 struct io_rsrc_node, node);
7600 /* recycle ref nodes in order */
7601 if (!node->done)
7602 break;
7603 list_del(&node->node);
7604 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7605 }
7606 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7607
7608 if (first_add)
7609 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7610}
7611
7612static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7613{
7614 struct io_rsrc_node *ref_node;
7615
7616 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7617 if (!ref_node)
7618 return NULL;
7619
7620 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7621 0, GFP_KERNEL)) {
7622 kfree(ref_node);
7623 return NULL;
7624 }
7625 INIT_LIST_HEAD(&ref_node->node);
7626 INIT_LIST_HEAD(&ref_node->rsrc_list);
7627 ref_node->done = false;
7628 return ref_node;
7629}
7630
a7f0ed5a
PB
7631static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7632 struct io_rsrc_data *data_to_kill)
6b06314c 7633{
a7f0ed5a
PB
7634 WARN_ON_ONCE(!ctx->rsrc_backup_node);
7635 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
6b06314c 7636
a7f0ed5a
PB
7637 if (data_to_kill) {
7638 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
82fbcfa9 7639
a7f0ed5a 7640 rsrc_node->rsrc_data = data_to_kill;
4956b9ea 7641 spin_lock_irq(&ctx->rsrc_ref_lock);
a7f0ed5a 7642 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
4956b9ea 7643 spin_unlock_irq(&ctx->rsrc_ref_lock);
82fbcfa9 7644
3e942498 7645 atomic_inc(&data_to_kill->refs);
a7f0ed5a
PB
7646 percpu_ref_kill(&rsrc_node->refs);
7647 ctx->rsrc_node = NULL;
7648 }
6b06314c 7649
a7f0ed5a
PB
7650 if (!ctx->rsrc_node) {
7651 ctx->rsrc_node = ctx->rsrc_backup_node;
7652 ctx->rsrc_backup_node = NULL;
7653 }
8bad28d8
HX
7654}
7655
a7f0ed5a 7656static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8dd03afe
PB
7657{
7658 if (ctx->rsrc_backup_node)
7659 return 0;
b895c9a6 7660 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
8dd03afe 7661 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8bad28d8
HX
7662}
7663
c072481d
PB
7664static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
7665 struct io_ring_ctx *ctx)
8bad28d8
HX
7666{
7667 int ret;
05589553 7668
215c3902 7669 /* As we may drop ->uring_lock, other task may have started quiesce */
8bad28d8
HX
7670 if (data->quiesce)
7671 return -ENXIO;
05589553 7672
8bad28d8 7673 data->quiesce = true;
1ffc5422 7674 do {
a7f0ed5a 7675 ret = io_rsrc_node_switch_start(ctx);
8dd03afe 7676 if (ret)
f2303b1f 7677 break;
a7f0ed5a 7678 io_rsrc_node_switch(ctx, data);
f2303b1f 7679
3e942498
PB
7680 /* kill initial ref, already quiesced if zero */
7681 if (atomic_dec_and_test(&data->refs))
7682 break;
c018db4a 7683 mutex_unlock(&ctx->uring_lock);
8bad28d8 7684 flush_delayed_work(&ctx->rsrc_put_work);
1ffc5422 7685 ret = wait_for_completion_interruptible(&data->done);
c018db4a
JA
7686 if (!ret) {
7687 mutex_lock(&ctx->uring_lock);
1ffc5422 7688 break;
c018db4a 7689 }
8bad28d8 7690
3e942498
PB
7691 atomic_inc(&data->refs);
7692 /* wait for all works potentially completing data->done */
7693 flush_delayed_work(&ctx->rsrc_put_work);
cb5e1b81 7694 reinit_completion(&data->done);
8dd03afe 7695
1ffc5422 7696 ret = io_run_task_work_sig();
8bad28d8 7697 mutex_lock(&ctx->uring_lock);
f2303b1f 7698 } while (ret >= 0);
8bad28d8 7699 data->quiesce = false;
05f3fb3c 7700
8bad28d8 7701 return ret;
d7954b2b
BM
7702}
7703
2d091d62
PB
7704static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7705{
7706 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7707 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7708
7709 return &data->tags[table_idx][off];
7710}
7711
44b31f2f 7712static void io_rsrc_data_free(struct io_rsrc_data *data)
1ad555c6 7713{
2d091d62
PB
7714 size_t size = data->nr * sizeof(data->tags[0][0]);
7715
7716 if (data->tags)
7717 io_free_page_table((void **)data->tags, size);
44b31f2f
PB
7718 kfree(data);
7719}
7720
c072481d
PB
7721static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7722 u64 __user *utags, unsigned nr,
7723 struct io_rsrc_data **pdata)
1ad555c6 7724{
b895c9a6 7725 struct io_rsrc_data *data;
2d091d62 7726 int ret = -ENOMEM;
d878c816 7727 unsigned i;
1ad555c6
BM
7728
7729 data = kzalloc(sizeof(*data), GFP_KERNEL);
7730 if (!data)
d878c816 7731 return -ENOMEM;
2d091d62 7732 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
b60c8dce 7733 if (!data->tags) {
1ad555c6 7734 kfree(data);
d878c816
PB
7735 return -ENOMEM;
7736 }
2d091d62
PB
7737
7738 data->nr = nr;
7739 data->ctx = ctx;
7740 data->do_put = do_put;
d878c816 7741 if (utags) {
2d091d62 7742 ret = -EFAULT;
d878c816 7743 for (i = 0; i < nr; i++) {
fdd1dc31
CIK
7744 u64 *tag_slot = io_get_tag_slot(data, i);
7745
7746 if (copy_from_user(tag_slot, &utags[i],
7747 sizeof(*tag_slot)))
2d091d62 7748 goto fail;
d878c816 7749 }
1ad555c6 7750 }
b60c8dce 7751
3e942498 7752 atomic_set(&data->refs, 1);
1ad555c6 7753 init_completion(&data->done);
d878c816
PB
7754 *pdata = data;
7755 return 0;
2d091d62
PB
7756fail:
7757 io_rsrc_data_free(data);
7758 return ret;
1ad555c6
BM
7759}
7760
9123c8ff
PB
7761static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7762{
0bea96f5
PB
7763 table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7764 GFP_KERNEL_ACCOUNT);
9123c8ff
PB
7765 return !!table->files;
7766}
7767
042b0d85 7768static void io_free_file_tables(struct io_file_table *table)
9123c8ff 7769{
042b0d85 7770 kvfree(table->files);
9123c8ff
PB
7771 table->files = NULL;
7772}
7773
fff4db76 7774static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1ad555c6 7775{
fff4db76
PB
7776#if defined(CONFIG_UNIX)
7777 if (ctx->ring_sock) {
7778 struct sock *sock = ctx->ring_sock->sk;
7779 struct sk_buff *skb;
7780
7781 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7782 kfree_skb(skb);
7783 }
7784#else
7785 int i;
7786
7787 for (i = 0; i < ctx->nr_user_files; i++) {
7788 struct file *file;
7789
7790 file = io_file_from_index(ctx, i);
7791 if (file)
7792 fput(file);
7793 }
7794#endif
042b0d85 7795 io_free_file_tables(&ctx->file_table);
44b31f2f 7796 io_rsrc_data_free(ctx->file_data);
fff4db76
PB
7797 ctx->file_data = NULL;
7798 ctx->nr_user_files = 0;
1ad555c6
BM
7799}
7800
d7954b2b
BM
7801static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7802{
d7954b2b
BM
7803 int ret;
7804
08480400 7805 if (!ctx->file_data)
d7954b2b 7806 return -ENXIO;
08480400
PB
7807 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7808 if (!ret)
7809 __io_sqe_files_unregister(ctx);
7810 return ret;
6b06314c
JA
7811}
7812
37d1e2e3 7813static void io_sq_thread_unpark(struct io_sq_data *sqd)
09a6f4ef 7814 __releases(&sqd->lock)
37d1e2e3 7815{
521d6a73
PB
7816 WARN_ON_ONCE(sqd->thread == current);
7817
9e138a48
PB
7818 /*
7819 * Do the dance but not conditional clear_bit() because it'd race with
7820 * other threads incrementing park_pending and setting the bit.
7821 */
37d1e2e3 7822 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
9e138a48
PB
7823 if (atomic_dec_return(&sqd->park_pending))
7824 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 7825 mutex_unlock(&sqd->lock);
37d1e2e3
JA
7826}
7827
86e0d676 7828static void io_sq_thread_park(struct io_sq_data *sqd)
09a6f4ef 7829 __acquires(&sqd->lock)
37d1e2e3 7830{
521d6a73
PB
7831 WARN_ON_ONCE(sqd->thread == current);
7832
9e138a48 7833 atomic_inc(&sqd->park_pending);
86e0d676 7834 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 7835 mutex_lock(&sqd->lock);
05962f95 7836 if (sqd->thread)
86e0d676 7837 wake_up_process(sqd->thread);
37d1e2e3
JA
7838}
7839
7840static void io_sq_thread_stop(struct io_sq_data *sqd)
7841{
521d6a73 7842 WARN_ON_ONCE(sqd->thread == current);
88885f66 7843 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
521d6a73 7844
05962f95 7845 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
88885f66 7846 mutex_lock(&sqd->lock);
e8f98f24
JA
7847 if (sqd->thread)
7848 wake_up_process(sqd->thread);
09a6f4ef 7849 mutex_unlock(&sqd->lock);
05962f95 7850 wait_for_completion(&sqd->exited);
37d1e2e3
JA
7851}
7852
534ca6d6 7853static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 7854{
534ca6d6 7855 if (refcount_dec_and_test(&sqd->refs)) {
9e138a48
PB
7856 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7857
37d1e2e3
JA
7858 io_sq_thread_stop(sqd);
7859 kfree(sqd);
7860 }
7861}
7862
7863static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7864{
7865 struct io_sq_data *sqd = ctx->sq_data;
7866
7867 if (sqd) {
05962f95 7868 io_sq_thread_park(sqd);
521d6a73 7869 list_del_init(&ctx->sqd_list);
37d1e2e3 7870 io_sqd_update_thread_idle(sqd);
05962f95 7871 io_sq_thread_unpark(sqd);
37d1e2e3
JA
7872
7873 io_put_sq_data(sqd);
7874 ctx->sq_data = NULL;
534ca6d6
JA
7875 }
7876}
7877
aa06165d
JA
7878static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7879{
7880 struct io_ring_ctx *ctx_attach;
7881 struct io_sq_data *sqd;
7882 struct fd f;
7883
7884 f = fdget(p->wq_fd);
7885 if (!f.file)
7886 return ERR_PTR(-ENXIO);
7887 if (f.file->f_op != &io_uring_fops) {
7888 fdput(f);
7889 return ERR_PTR(-EINVAL);
7890 }
7891
7892 ctx_attach = f.file->private_data;
7893 sqd = ctx_attach->sq_data;
7894 if (!sqd) {
7895 fdput(f);
7896 return ERR_PTR(-EINVAL);
7897 }
5c2469e0
JA
7898 if (sqd->task_tgid != current->tgid) {
7899 fdput(f);
7900 return ERR_PTR(-EPERM);
7901 }
aa06165d
JA
7902
7903 refcount_inc(&sqd->refs);
7904 fdput(f);
7905 return sqd;
7906}
7907
26984fbf
PB
7908static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7909 bool *attached)
534ca6d6
JA
7910{
7911 struct io_sq_data *sqd;
7912
26984fbf 7913 *attached = false;
5c2469e0
JA
7914 if (p->flags & IORING_SETUP_ATTACH_WQ) {
7915 sqd = io_attach_sq_data(p);
26984fbf
PB
7916 if (!IS_ERR(sqd)) {
7917 *attached = true;
5c2469e0 7918 return sqd;
26984fbf 7919 }
5c2469e0
JA
7920 /* fall through for EPERM case, setup new sqd/task */
7921 if (PTR_ERR(sqd) != -EPERM)
7922 return sqd;
7923 }
aa06165d 7924
534ca6d6
JA
7925 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7926 if (!sqd)
7927 return ERR_PTR(-ENOMEM);
7928
9e138a48 7929 atomic_set(&sqd->park_pending, 0);
534ca6d6 7930 refcount_set(&sqd->refs, 1);
69fb2131 7931 INIT_LIST_HEAD(&sqd->ctx_list);
09a6f4ef 7932 mutex_init(&sqd->lock);
534ca6d6 7933 init_waitqueue_head(&sqd->wait);
37d1e2e3 7934 init_completion(&sqd->exited);
534ca6d6
JA
7935 return sqd;
7936}
7937
6b06314c 7938#if defined(CONFIG_UNIX)
6b06314c
JA
7939/*
7940 * Ensure the UNIX gc is aware of our file set, so we are certain that
7941 * the io_uring can be safely unregistered on process exit, even if we have
7942 * loops in the file referencing.
7943 */
7944static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7945{
7946 struct sock *sk = ctx->ring_sock->sk;
7947 struct scm_fp_list *fpl;
7948 struct sk_buff *skb;
08a45173 7949 int i, nr_files;
6b06314c 7950
6b06314c
JA
7951 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7952 if (!fpl)
7953 return -ENOMEM;
7954
7955 skb = alloc_skb(0, GFP_KERNEL);
7956 if (!skb) {
7957 kfree(fpl);
7958 return -ENOMEM;
7959 }
7960
7961 skb->sk = sk;
6b06314c 7962
08a45173 7963 nr_files = 0;
62e398be 7964 fpl->user = get_uid(current_user());
6b06314c 7965 for (i = 0; i < nr; i++) {
65e19f54
JA
7966 struct file *file = io_file_from_index(ctx, i + offset);
7967
7968 if (!file)
08a45173 7969 continue;
65e19f54 7970 fpl->fp[nr_files] = get_file(file);
08a45173
JA
7971 unix_inflight(fpl->user, fpl->fp[nr_files]);
7972 nr_files++;
6b06314c
JA
7973 }
7974
08a45173
JA
7975 if (nr_files) {
7976 fpl->max = SCM_MAX_FD;
7977 fpl->count = nr_files;
7978 UNIXCB(skb).fp = fpl;
05f3fb3c 7979 skb->destructor = unix_destruct_scm;
08a45173
JA
7980 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7981 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 7982
08a45173
JA
7983 for (i = 0; i < nr_files; i++)
7984 fput(fpl->fp[i]);
7985 } else {
7986 kfree_skb(skb);
7987 kfree(fpl);
7988 }
6b06314c
JA
7989
7990 return 0;
7991}
7992
7993/*
7994 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7995 * causes regular reference counting to break down. We rely on the UNIX
7996 * garbage collection to take care of this problem for us.
7997 */
7998static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7999{
8000 unsigned left, total;
8001 int ret = 0;
8002
8003 total = 0;
8004 left = ctx->nr_user_files;
8005 while (left) {
8006 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
8007
8008 ret = __io_sqe_files_scm(ctx, this_files, total);
8009 if (ret)
8010 break;
8011 left -= this_files;
8012 total += this_files;
8013 }
8014
8015 if (!ret)
8016 return 0;
8017
8018 while (total < ctx->nr_user_files) {
65e19f54
JA
8019 struct file *file = io_file_from_index(ctx, total);
8020
8021 if (file)
8022 fput(file);
6b06314c
JA
8023 total++;
8024 }
8025
8026 return ret;
8027}
8028#else
8029static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8030{
8031 return 0;
8032}
8033#endif
8034
47e90392 8035static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 8036{
50238531 8037 struct file *file = prsrc->file;
05f3fb3c
JA
8038#if defined(CONFIG_UNIX)
8039 struct sock *sock = ctx->ring_sock->sk;
8040 struct sk_buff_head list, *head = &sock->sk_receive_queue;
8041 struct sk_buff *skb;
8042 int i;
8043
8044 __skb_queue_head_init(&list);
8045
8046 /*
8047 * Find the skb that holds this file in its SCM_RIGHTS. When found,
8048 * remove this entry and rearrange the file array.
8049 */
8050 skb = skb_dequeue(head);
8051 while (skb) {
8052 struct scm_fp_list *fp;
8053
8054 fp = UNIXCB(skb).fp;
8055 for (i = 0; i < fp->count; i++) {
8056 int left;
8057
8058 if (fp->fp[i] != file)
8059 continue;
8060
8061 unix_notinflight(fp->user, fp->fp[i]);
8062 left = fp->count - 1 - i;
8063 if (left) {
8064 memmove(&fp->fp[i], &fp->fp[i + 1],
8065 left * sizeof(struct file *));
8066 }
8067 fp->count--;
8068 if (!fp->count) {
8069 kfree_skb(skb);
8070 skb = NULL;
8071 } else {
8072 __skb_queue_tail(&list, skb);
8073 }
8074 fput(file);
8075 file = NULL;
8076 break;
8077 }
8078
8079 if (!file)
8080 break;
8081
8082 __skb_queue_tail(&list, skb);
8083
8084 skb = skb_dequeue(head);
8085 }
8086
8087 if (skb_peek(&list)) {
8088 spin_lock_irq(&head->lock);
8089 while ((skb = __skb_dequeue(&list)) != NULL)
8090 __skb_queue_tail(head, skb);
8091 spin_unlock_irq(&head->lock);
8092 }
8093#else
8094 fput(file);
8095#endif
8096}
8097
b895c9a6 8098static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
65e19f54 8099{
b895c9a6 8100 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
269bbe5f
BM
8101 struct io_ring_ctx *ctx = rsrc_data->ctx;
8102 struct io_rsrc_put *prsrc, *tmp;
05589553 8103
269bbe5f
BM
8104 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8105 list_del(&prsrc->list);
b60c8dce
PB
8106
8107 if (prsrc->tag) {
8108 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
b60c8dce
PB
8109
8110 io_ring_submit_lock(ctx, lock_ring);
79ebeaee 8111 spin_lock(&ctx->completion_lock);
b60c8dce 8112 io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
2840f710 8113 ctx->cq_extra++;
b60c8dce 8114 io_commit_cqring(ctx);
79ebeaee 8115 spin_unlock(&ctx->completion_lock);
b60c8dce
PB
8116 io_cqring_ev_posted(ctx);
8117 io_ring_submit_unlock(ctx, lock_ring);
8118 }
8119
40ae0ff7 8120 rsrc_data->do_put(ctx, prsrc);
269bbe5f 8121 kfree(prsrc);
65e19f54 8122 }
05589553 8123
28a9fe25 8124 io_rsrc_node_destroy(ref_node);
3e942498
PB
8125 if (atomic_dec_and_test(&rsrc_data->refs))
8126 complete(&rsrc_data->done);
2faf852d 8127}
65e19f54 8128
269bbe5f 8129static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
8130{
8131 struct io_ring_ctx *ctx;
8132 struct llist_node *node;
8133
269bbe5f
BM
8134 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8135 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
8136
8137 while (node) {
b895c9a6 8138 struct io_rsrc_node *ref_node;
4a38aed2
JA
8139 struct llist_node *next = node->next;
8140
b895c9a6 8141 ref_node = llist_entry(node, struct io_rsrc_node, llist);
269bbe5f 8142 __io_rsrc_put_work(ref_node);
4a38aed2
JA
8143 node = next;
8144 }
8145}
8146
6b06314c 8147static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
792e3582 8148 unsigned nr_args, u64 __user *tags)
6b06314c
JA
8149{
8150 __s32 __user *fds = (__s32 __user *) arg;
05f3fb3c 8151 struct file *file;
f3baed39 8152 int fd, ret;
846a4ef2 8153 unsigned i;
6b06314c 8154
05f3fb3c 8155 if (ctx->file_data)
6b06314c
JA
8156 return -EBUSY;
8157 if (!nr_args)
8158 return -EINVAL;
8159 if (nr_args > IORING_MAX_FIXED_FILES)
8160 return -EMFILE;
3a1b8a4e
PB
8161 if (nr_args > rlimit(RLIMIT_NOFILE))
8162 return -EMFILE;
a7f0ed5a 8163 ret = io_rsrc_node_switch_start(ctx);
f3baed39
PB
8164 if (ret)
8165 return ret;
d878c816
PB
8166 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8167 &ctx->file_data);
8168 if (ret)
8169 return ret;
6b06314c 8170
f3baed39 8171 ret = -ENOMEM;
aeca241b 8172 if (!io_alloc_file_tables(&ctx->file_table, nr_args))
1ad555c6 8173 goto out_free;
65e19f54 8174
08a45173 8175 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
d878c816 8176 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
600cf3f8
PB
8177 ret = -EFAULT;
8178 goto out_fput;
8179 }
08a45173 8180 /* allow sparse sets */
792e3582
PB
8181 if (fd == -1) {
8182 ret = -EINVAL;
2d091d62 8183 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
792e3582 8184 goto out_fput;
08a45173 8185 continue;
792e3582 8186 }
6b06314c 8187
05f3fb3c 8188 file = fget(fd);
6b06314c 8189 ret = -EBADF;
792e3582 8190 if (unlikely(!file))
600cf3f8 8191 goto out_fput;
05f3fb3c 8192
6b06314c
JA
8193 /*
8194 * Don't allow io_uring instances to be registered. If UNIX
8195 * isn't enabled, then this causes a reference cycle and this
8196 * instance can never get freed. If UNIX is enabled we'll
8197 * handle it just fine, but there's still no point in allowing
8198 * a ring fd as it doesn't support regular read/write anyway.
8199 */
05f3fb3c
JA
8200 if (file->f_op == &io_uring_fops) {
8201 fput(file);
600cf3f8 8202 goto out_fput;
6b06314c 8203 }
aeca241b 8204 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
6b06314c
JA
8205 }
8206
6b06314c 8207 ret = io_sqe_files_scm(ctx);
05589553 8208 if (ret) {
08480400 8209 __io_sqe_files_unregister(ctx);
05589553
XW
8210 return ret;
8211 }
6b06314c 8212
a7f0ed5a 8213 io_rsrc_node_switch(ctx, NULL);
6b06314c 8214 return ret;
600cf3f8
PB
8215out_fput:
8216 for (i = 0; i < ctx->nr_user_files; i++) {
8217 file = io_file_from_index(ctx, i);
8218 if (file)
8219 fput(file);
8220 }
042b0d85 8221 io_free_file_tables(&ctx->file_table);
600cf3f8 8222 ctx->nr_user_files = 0;
600cf3f8 8223out_free:
44b31f2f 8224 io_rsrc_data_free(ctx->file_data);
55cbc256 8225 ctx->file_data = NULL;
6b06314c
JA
8226 return ret;
8227}
8228
c3a31e60
JA
8229static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
8230 int index)
8231{
8232#if defined(CONFIG_UNIX)
8233 struct sock *sock = ctx->ring_sock->sk;
8234 struct sk_buff_head *head = &sock->sk_receive_queue;
8235 struct sk_buff *skb;
8236
8237 /*
8238 * See if we can merge this file into an existing skb SCM_RIGHTS
8239 * file set. If there's no room, fall back to allocating a new skb
8240 * and filling it in.
8241 */
8242 spin_lock_irq(&head->lock);
8243 skb = skb_peek(head);
8244 if (skb) {
8245 struct scm_fp_list *fpl = UNIXCB(skb).fp;
8246
8247 if (fpl->count < SCM_MAX_FD) {
8248 __skb_unlink(skb, head);
8249 spin_unlock_irq(&head->lock);
8250 fpl->fp[fpl->count] = get_file(file);
8251 unix_inflight(fpl->user, fpl->fp[fpl->count]);
8252 fpl->count++;
8253 spin_lock_irq(&head->lock);
8254 __skb_queue_head(head, skb);
8255 } else {
8256 skb = NULL;
8257 }
8258 }
8259 spin_unlock_irq(&head->lock);
8260
8261 if (skb) {
8262 fput(file);
8263 return 0;
8264 }
8265
8266 return __io_sqe_files_scm(ctx, 1, index);
8267#else
8268 return 0;
8269#endif
8270}
8271
9c7b0ba8
PB
8272static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8273 struct io_rsrc_node *node, void *rsrc)
8274{
8275 struct io_rsrc_put *prsrc;
8276
8277 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8278 if (!prsrc)
8279 return -ENOMEM;
8280
8281 prsrc->tag = *io_get_tag_slot(data, idx);
8282 prsrc->rsrc = rsrc;
8283 list_add(&prsrc->list, &node->rsrc_list);
8284 return 0;
8285}
8286
b9445598
PB
8287static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8288 unsigned int issue_flags, u32 slot_index)
8289{
8290 struct io_ring_ctx *ctx = req->ctx;
8291 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
9c7b0ba8 8292 bool needs_switch = false;
b9445598
PB
8293 struct io_fixed_file *file_slot;
8294 int ret = -EBADF;
8295
8296 io_ring_submit_lock(ctx, !force_nonblock);
8297 if (file->f_op == &io_uring_fops)
8298 goto err;
8299 ret = -ENXIO;
8300 if (!ctx->file_data)
8301 goto err;
8302 ret = -EINVAL;
8303 if (slot_index >= ctx->nr_user_files)
8304 goto err;
8305
8306 slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8307 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
9c7b0ba8
PB
8308
8309 if (file_slot->file_ptr) {
8310 struct file *old_file;
8311
8312 ret = io_rsrc_node_switch_start(ctx);
8313 if (ret)
8314 goto err;
8315
8316 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8317 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8318 ctx->rsrc_node, old_file);
8319 if (ret)
8320 goto err;
8321 file_slot->file_ptr = 0;
8322 needs_switch = true;
8323 }
b9445598
PB
8324
8325 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8326 io_fixed_file_set(file_slot, file);
8327 ret = io_sqe_file_register(ctx, file, slot_index);
8328 if (ret) {
8329 file_slot->file_ptr = 0;
8330 goto err;
8331 }
8332
8333 ret = 0;
8334err:
9c7b0ba8
PB
8335 if (needs_switch)
8336 io_rsrc_node_switch(ctx, ctx->file_data);
b9445598
PB
8337 io_ring_submit_unlock(ctx, !force_nonblock);
8338 if (ret)
8339 fput(file);
8340 return ret;
8341}
8342
7df778be
PB
8343static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8344{
8345 unsigned int offset = req->close.file_slot - 1;
8346 struct io_ring_ctx *ctx = req->ctx;
8347 struct io_fixed_file *file_slot;
8348 struct file *file;
8349 int ret, i;
8350
8351 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8352 ret = -ENXIO;
8353 if (unlikely(!ctx->file_data))
8354 goto out;
8355 ret = -EINVAL;
8356 if (offset >= ctx->nr_user_files)
8357 goto out;
8358 ret = io_rsrc_node_switch_start(ctx);
8359 if (ret)
8360 goto out;
8361
8362 i = array_index_nospec(offset, ctx->nr_user_files);
8363 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8364 ret = -EBADF;
8365 if (!file_slot->file_ptr)
8366 goto out;
8367
8368 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8369 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8370 if (ret)
8371 goto out;
8372
8373 file_slot->file_ptr = 0;
8374 io_rsrc_node_switch(ctx, ctx->file_data);
8375 ret = 0;
8376out:
8377 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8378 return ret;
8379}
8380
05f3fb3c 8381static int __io_sqe_files_update(struct io_ring_ctx *ctx,
c3bdad02 8382 struct io_uring_rsrc_update2 *up,
05f3fb3c
JA
8383 unsigned nr_args)
8384{
c3bdad02 8385 u64 __user *tags = u64_to_user_ptr(up->tags);
98f0b3b4 8386 __s32 __user *fds = u64_to_user_ptr(up->data);
b895c9a6 8387 struct io_rsrc_data *data = ctx->file_data;
a04b0ac0
PB
8388 struct io_fixed_file *file_slot;
8389 struct file *file;
98f0b3b4
PB
8390 int fd, i, err = 0;
8391 unsigned int done;
05589553 8392 bool needs_switch = false;
c3a31e60 8393
98f0b3b4
PB
8394 if (!ctx->file_data)
8395 return -ENXIO;
8396 if (up->offset + nr_args > ctx->nr_user_files)
c3a31e60
JA
8397 return -EINVAL;
8398
67973b93 8399 for (done = 0; done < nr_args; done++) {
c3bdad02
PB
8400 u64 tag = 0;
8401
8402 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8403 copy_from_user(&fd, &fds[done], sizeof(fd))) {
c3a31e60
JA
8404 err = -EFAULT;
8405 break;
8406 }
c3bdad02
PB
8407 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8408 err = -EINVAL;
8409 break;
8410 }
4e0377a1 8411 if (fd == IORING_REGISTER_FILES_SKIP)
8412 continue;
8413
67973b93 8414 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
aeca241b 8415 file_slot = io_fixed_file_slot(&ctx->file_table, i);
ea64ec02 8416
a04b0ac0
PB
8417 if (file_slot->file_ptr) {
8418 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
b60c8dce
PB
8419 err = io_queue_rsrc_removal(data, up->offset + done,
8420 ctx->rsrc_node, file);
a5318d3c
HD
8421 if (err)
8422 break;
a04b0ac0 8423 file_slot->file_ptr = 0;
05589553 8424 needs_switch = true;
c3a31e60
JA
8425 }
8426 if (fd != -1) {
c3a31e60
JA
8427 file = fget(fd);
8428 if (!file) {
8429 err = -EBADF;
8430 break;
8431 }
8432 /*
8433 * Don't allow io_uring instances to be registered. If
8434 * UNIX isn't enabled, then this causes a reference
8435 * cycle and this instance can never get freed. If UNIX
8436 * is enabled we'll handle it just fine, but there's
8437 * still no point in allowing a ring fd as it doesn't
8438 * support regular read/write anyway.
8439 */
8440 if (file->f_op == &io_uring_fops) {
8441 fput(file);
8442 err = -EBADF;
8443 break;
8444 }
2d091d62 8445 *io_get_tag_slot(data, up->offset + done) = tag;
9a321c98 8446 io_fixed_file_set(file_slot, file);
c3a31e60 8447 err = io_sqe_file_register(ctx, file, i);
f3bd9dae 8448 if (err) {
a04b0ac0 8449 file_slot->file_ptr = 0;
f3bd9dae 8450 fput(file);
c3a31e60 8451 break;
f3bd9dae 8452 }
c3a31e60 8453 }
05f3fb3c
JA
8454 }
8455
a7f0ed5a
PB
8456 if (needs_switch)
8457 io_rsrc_node_switch(ctx, data);
c3a31e60
JA
8458 return done ? done : err;
8459}
05589553 8460
685fe7fe
JA
8461static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8462 struct task_struct *task)
24369c2e 8463{
e941894e 8464 struct io_wq_hash *hash;
24369c2e 8465 struct io_wq_data data;
24369c2e 8466 unsigned int concurrency;
24369c2e 8467
362a9e65 8468 mutex_lock(&ctx->uring_lock);
e941894e
JA
8469 hash = ctx->hash_map;
8470 if (!hash) {
8471 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
362a9e65
YY
8472 if (!hash) {
8473 mutex_unlock(&ctx->uring_lock);
e941894e 8474 return ERR_PTR(-ENOMEM);
362a9e65 8475 }
e941894e
JA
8476 refcount_set(&hash->refs, 1);
8477 init_waitqueue_head(&hash->wait);
8478 ctx->hash_map = hash;
24369c2e 8479 }
362a9e65 8480 mutex_unlock(&ctx->uring_lock);
24369c2e 8481
e941894e 8482 data.hash = hash;
685fe7fe 8483 data.task = task;
ebc11b6c 8484 data.free_work = io_wq_free_work;
f5fa38c5 8485 data.do_work = io_wq_submit_work;
24369c2e 8486
d25e3a3d
JA
8487 /* Do QD, or 4 * CPUS, whatever is smallest */
8488 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 8489
5aa75ed5 8490 return io_wq_create(concurrency, &data);
24369c2e
PB
8491}
8492
c072481d
PB
8493static __cold int io_uring_alloc_task_context(struct task_struct *task,
8494 struct io_ring_ctx *ctx)
0f212204
JA
8495{
8496 struct io_uring_task *tctx;
d8a6df10 8497 int ret;
0f212204 8498
09899b19 8499 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
0f212204
JA
8500 if (unlikely(!tctx))
8501 return -ENOMEM;
8502
d8a6df10
JA
8503 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8504 if (unlikely(ret)) {
8505 kfree(tctx);
8506 return ret;
8507 }
8508
685fe7fe 8509 tctx->io_wq = io_init_wq_offload(ctx, task);
5aa75ed5
JA
8510 if (IS_ERR(tctx->io_wq)) {
8511 ret = PTR_ERR(tctx->io_wq);
8512 percpu_counter_destroy(&tctx->inflight);
8513 kfree(tctx);
8514 return ret;
8515 }
8516
0f212204
JA
8517 xa_init(&tctx->xa);
8518 init_waitqueue_head(&tctx->wait);
fdaf083c 8519 atomic_set(&tctx->in_idle, 0);
b303fe2e 8520 atomic_set(&tctx->inflight_tracked, 0);
0f212204 8521 task->io_uring = tctx;
7cbf1722
JA
8522 spin_lock_init(&tctx->task_lock);
8523 INIT_WQ_LIST(&tctx->task_list);
7cbf1722 8524 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
8525 return 0;
8526}
8527
8528void __io_uring_free(struct task_struct *tsk)
8529{
8530 struct io_uring_task *tctx = tsk->io_uring;
8531
8532 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e 8533 WARN_ON_ONCE(tctx->io_wq);
09899b19 8534 WARN_ON_ONCE(tctx->cached_refs);
ef8eaa4e 8535
d8a6df10 8536 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
8537 kfree(tctx);
8538 tsk->io_uring = NULL;
8539}
8540
c072481d
PB
8541static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
8542 struct io_uring_params *p)
2b188cc1
JA
8543{
8544 int ret;
8545
d25e3a3d
JA
8546 /* Retain compatibility with failing for an invalid attach attempt */
8547 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8548 IORING_SETUP_ATTACH_WQ) {
8549 struct fd f;
8550
8551 f = fdget(p->wq_fd);
8552 if (!f.file)
8553 return -ENXIO;
0cc936f7
JA
8554 if (f.file->f_op != &io_uring_fops) {
8555 fdput(f);
f2a48dd0 8556 return -EINVAL;
0cc936f7
JA
8557 }
8558 fdput(f);
d25e3a3d 8559 }
6c271ce2 8560 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 8561 struct task_struct *tsk;
534ca6d6 8562 struct io_sq_data *sqd;
26984fbf 8563 bool attached;
534ca6d6 8564
26984fbf 8565 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
8566 if (IS_ERR(sqd)) {
8567 ret = PTR_ERR(sqd);
8568 goto err;
8569 }
69fb2131 8570
7c30f36a 8571 ctx->sq_creds = get_current_cred();
534ca6d6 8572 ctx->sq_data = sqd;
917257da
JA
8573 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8574 if (!ctx->sq_thread_idle)
8575 ctx->sq_thread_idle = HZ;
8576
78d7f6ba 8577 io_sq_thread_park(sqd);
de75a3d3
PB
8578 list_add(&ctx->sqd_list, &sqd->ctx_list);
8579 io_sqd_update_thread_idle(sqd);
26984fbf 8580 /* don't attach to a dying SQPOLL thread, would be racy */
f2a48dd0 8581 ret = (attached && !sqd->thread) ? -ENXIO : 0;
78d7f6ba
PB
8582 io_sq_thread_unpark(sqd);
8583
de75a3d3
PB
8584 if (ret < 0)
8585 goto err;
8586 if (attached)
5aa75ed5 8587 return 0;
aa06165d 8588
6c271ce2 8589 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 8590 int cpu = p->sq_thread_cpu;
6c271ce2 8591
917257da 8592 ret = -EINVAL;
f2a48dd0 8593 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
e8f98f24 8594 goto err_sqpoll;
37d1e2e3 8595 sqd->sq_cpu = cpu;
6c271ce2 8596 } else {
37d1e2e3 8597 sqd->sq_cpu = -1;
6c271ce2 8598 }
37d1e2e3
JA
8599
8600 sqd->task_pid = current->pid;
5c2469e0 8601 sqd->task_tgid = current->tgid;
46fe18b1
JA
8602 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8603 if (IS_ERR(tsk)) {
8604 ret = PTR_ERR(tsk);
e8f98f24 8605 goto err_sqpoll;
6c271ce2 8606 }
97a73a0f 8607
46fe18b1 8608 sqd->thread = tsk;
97a73a0f 8609 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 8610 wake_up_new_task(tsk);
0f212204
JA
8611 if (ret)
8612 goto err;
6c271ce2
JA
8613 } else if (p->flags & IORING_SETUP_SQ_AFF) {
8614 /* Can't have SQ_AFF without SQPOLL */
8615 ret = -EINVAL;
8616 goto err;
8617 }
8618
2b188cc1 8619 return 0;
f2a48dd0
PB
8620err_sqpoll:
8621 complete(&ctx->sq_data->exited);
2b188cc1 8622err:
37d1e2e3 8623 io_sq_thread_finish(ctx);
2b188cc1
JA
8624 return ret;
8625}
8626
a087e2b5
BM
8627static inline void __io_unaccount_mem(struct user_struct *user,
8628 unsigned long nr_pages)
2b188cc1
JA
8629{
8630 atomic_long_sub(nr_pages, &user->locked_vm);
8631}
8632
a087e2b5
BM
8633static inline int __io_account_mem(struct user_struct *user,
8634 unsigned long nr_pages)
2b188cc1
JA
8635{
8636 unsigned long page_limit, cur_pages, new_pages;
8637
8638 /* Don't allow more pages than we can safely lock */
8639 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8640
8641 do {
8642 cur_pages = atomic_long_read(&user->locked_vm);
8643 new_pages = cur_pages + nr_pages;
8644 if (new_pages > page_limit)
8645 return -ENOMEM;
8646 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8647 new_pages) != cur_pages);
8648
8649 return 0;
8650}
8651
26bfa89e 8652static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8653{
62e398be 8654 if (ctx->user)
a087e2b5 8655 __io_unaccount_mem(ctx->user, nr_pages);
30975825 8656
26bfa89e
JA
8657 if (ctx->mm_account)
8658 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8659}
8660
26bfa89e 8661static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8662{
30975825
BM
8663 int ret;
8664
62e398be 8665 if (ctx->user) {
30975825
BM
8666 ret = __io_account_mem(ctx->user, nr_pages);
8667 if (ret)
8668 return ret;
8669 }
8670
26bfa89e
JA
8671 if (ctx->mm_account)
8672 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8673
8674 return 0;
8675}
8676
2b188cc1
JA
8677static void io_mem_free(void *ptr)
8678{
52e04ef4
MR
8679 struct page *page;
8680
8681 if (!ptr)
8682 return;
2b188cc1 8683
52e04ef4 8684 page = virt_to_head_page(ptr);
2b188cc1
JA
8685 if (put_page_testzero(page))
8686 free_compound_page(page);
8687}
8688
8689static void *io_mem_alloc(size_t size)
8690{
8691 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
26bfa89e 8692 __GFP_NORETRY | __GFP_ACCOUNT;
2b188cc1
JA
8693
8694 return (void *) __get_free_pages(gfp_flags, get_order(size));
8695}
8696
75b28aff
HV
8697static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8698 size_t *sq_offset)
8699{
8700 struct io_rings *rings;
8701 size_t off, sq_array_size;
8702
8703 off = struct_size(rings, cqes, cq_entries);
8704 if (off == SIZE_MAX)
8705 return SIZE_MAX;
8706
8707#ifdef CONFIG_SMP
8708 off = ALIGN(off, SMP_CACHE_BYTES);
8709 if (off == 0)
8710 return SIZE_MAX;
8711#endif
8712
b36200f5
DV
8713 if (sq_offset)
8714 *sq_offset = off;
8715
75b28aff
HV
8716 sq_array_size = array_size(sizeof(u32), sq_entries);
8717 if (sq_array_size == SIZE_MAX)
8718 return SIZE_MAX;
8719
8720 if (check_add_overflow(off, sq_array_size, &off))
8721 return SIZE_MAX;
8722
75b28aff
HV
8723 return off;
8724}
8725
41edf1a5 8726static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
7f61a1e9 8727{
41edf1a5 8728 struct io_mapped_ubuf *imu = *slot;
7f61a1e9
PB
8729 unsigned int i;
8730
6224843d
PB
8731 if (imu != ctx->dummy_ubuf) {
8732 for (i = 0; i < imu->nr_bvecs; i++)
8733 unpin_user_page(imu->bvec[i].bv_page);
8734 if (imu->acct_pages)
8735 io_unaccount_mem(ctx, imu->acct_pages);
8736 kvfree(imu);
8737 }
41edf1a5 8738 *slot = NULL;
7f61a1e9
PB
8739}
8740
bd54b6fe 8741static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
edafccee 8742{
634d00df
PB
8743 io_buffer_unmap(ctx, &prsrc->buf);
8744 prsrc->buf = NULL;
bd54b6fe 8745}
edafccee 8746
bd54b6fe
BM
8747static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8748{
8749 unsigned int i;
edafccee 8750
7f61a1e9
PB
8751 for (i = 0; i < ctx->nr_user_bufs; i++)
8752 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
edafccee 8753 kfree(ctx->user_bufs);
bb6659cc 8754 io_rsrc_data_free(ctx->buf_data);
edafccee 8755 ctx->user_bufs = NULL;
bd54b6fe 8756 ctx->buf_data = NULL;
edafccee 8757 ctx->nr_user_bufs = 0;
bd54b6fe
BM
8758}
8759
0a96bbe4 8760static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee 8761{
bd54b6fe 8762 int ret;
edafccee 8763
bd54b6fe 8764 if (!ctx->buf_data)
edafccee
JA
8765 return -ENXIO;
8766
bd54b6fe
BM
8767 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8768 if (!ret)
8769 __io_sqe_buffers_unregister(ctx);
8770 return ret;
edafccee
JA
8771}
8772
8773static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8774 void __user *arg, unsigned index)
8775{
8776 struct iovec __user *src;
8777
8778#ifdef CONFIG_COMPAT
8779 if (ctx->compat) {
8780 struct compat_iovec __user *ciovs;
8781 struct compat_iovec ciov;
8782
8783 ciovs = (struct compat_iovec __user *) arg;
8784 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8785 return -EFAULT;
8786
d55e5f5b 8787 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
8788 dst->iov_len = ciov.iov_len;
8789 return 0;
8790 }
8791#endif
8792 src = (struct iovec __user *) arg;
8793 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8794 return -EFAULT;
8795 return 0;
8796}
8797
de293938
JA
8798/*
8799 * Not super efficient, but this is just a registration time. And we do cache
8800 * the last compound head, so generally we'll only do a full search if we don't
8801 * match that one.
8802 *
8803 * We check if the given compound head page has already been accounted, to
8804 * avoid double accounting it. This allows us to account the full size of the
8805 * page, not just the constituent pages of a huge page.
8806 */
8807static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8808 int nr_pages, struct page *hpage)
8809{
8810 int i, j;
8811
8812 /* check current page array */
8813 for (i = 0; i < nr_pages; i++) {
8814 if (!PageCompound(pages[i]))
8815 continue;
8816 if (compound_head(pages[i]) == hpage)
8817 return true;
8818 }
8819
8820 /* check previously registered pages */
8821 for (i = 0; i < ctx->nr_user_bufs; i++) {
41edf1a5 8822 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
de293938
JA
8823
8824 for (j = 0; j < imu->nr_bvecs; j++) {
8825 if (!PageCompound(imu->bvec[j].bv_page))
8826 continue;
8827 if (compound_head(imu->bvec[j].bv_page) == hpage)
8828 return true;
8829 }
8830 }
8831
8832 return false;
8833}
8834
8835static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8836 int nr_pages, struct io_mapped_ubuf *imu,
8837 struct page **last_hpage)
8838{
8839 int i, ret;
8840
216e5835 8841 imu->acct_pages = 0;
de293938
JA
8842 for (i = 0; i < nr_pages; i++) {
8843 if (!PageCompound(pages[i])) {
8844 imu->acct_pages++;
8845 } else {
8846 struct page *hpage;
8847
8848 hpage = compound_head(pages[i]);
8849 if (hpage == *last_hpage)
8850 continue;
8851 *last_hpage = hpage;
8852 if (headpage_already_acct(ctx, pages, i, hpage))
8853 continue;
8854 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8855 }
8856 }
8857
8858 if (!imu->acct_pages)
8859 return 0;
8860
26bfa89e 8861 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
8862 if (ret)
8863 imu->acct_pages = 0;
8864 return ret;
8865}
8866
0a96bbe4 8867static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
41edf1a5 8868 struct io_mapped_ubuf **pimu,
0a96bbe4 8869 struct page **last_hpage)
edafccee 8870{
41edf1a5 8871 struct io_mapped_ubuf *imu = NULL;
edafccee
JA
8872 struct vm_area_struct **vmas = NULL;
8873 struct page **pages = NULL;
0a96bbe4
BM
8874 unsigned long off, start, end, ubuf;
8875 size_t size;
8876 int ret, pret, nr_pages, i;
8877
6224843d
PB
8878 if (!iov->iov_base) {
8879 *pimu = ctx->dummy_ubuf;
8880 return 0;
8881 }
8882
0a96bbe4
BM
8883 ubuf = (unsigned long) iov->iov_base;
8884 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8885 start = ubuf >> PAGE_SHIFT;
8886 nr_pages = end - start;
8887
41edf1a5 8888 *pimu = NULL;
0a96bbe4
BM
8889 ret = -ENOMEM;
8890
8891 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8892 if (!pages)
8893 goto done;
8894
8895 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8896 GFP_KERNEL);
8897 if (!vmas)
8898 goto done;
edafccee 8899
41edf1a5 8900 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
a2b4198c 8901 if (!imu)
0a96bbe4
BM
8902 goto done;
8903
8904 ret = 0;
8905 mmap_read_lock(current->mm);
8906 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8907 pages, vmas);
8908 if (pret == nr_pages) {
8909 /* don't support file backed memory */
8910 for (i = 0; i < nr_pages; i++) {
8911 struct vm_area_struct *vma = vmas[i];
8912
40dad765
PB
8913 if (vma_is_shmem(vma))
8914 continue;
0a96bbe4
BM
8915 if (vma->vm_file &&
8916 !is_file_hugepages(vma->vm_file)) {
8917 ret = -EOPNOTSUPP;
8918 break;
8919 }
8920 }
8921 } else {
8922 ret = pret < 0 ? pret : -EFAULT;
8923 }
8924 mmap_read_unlock(current->mm);
8925 if (ret) {
8926 /*
8927 * if we did partial map, or found file backed vmas,
8928 * release any pages we did get
8929 */
8930 if (pret > 0)
8931 unpin_user_pages(pages, pret);
0a96bbe4
BM
8932 goto done;
8933 }
8934
8935 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8936 if (ret) {
8937 unpin_user_pages(pages, pret);
0a96bbe4
BM
8938 goto done;
8939 }
8940
8941 off = ubuf & ~PAGE_MASK;
8942 size = iov->iov_len;
8943 for (i = 0; i < nr_pages; i++) {
8944 size_t vec_len;
8945
8946 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8947 imu->bvec[i].bv_page = pages[i];
8948 imu->bvec[i].bv_len = vec_len;
8949 imu->bvec[i].bv_offset = off;
8950 off = 0;
8951 size -= vec_len;
8952 }
8953 /* store original address for later verification */
8954 imu->ubuf = ubuf;
4751f53d 8955 imu->ubuf_end = ubuf + iov->iov_len;
0a96bbe4 8956 imu->nr_bvecs = nr_pages;
41edf1a5 8957 *pimu = imu;
0a96bbe4
BM
8958 ret = 0;
8959done:
41edf1a5
PB
8960 if (ret)
8961 kvfree(imu);
0a96bbe4
BM
8962 kvfree(pages);
8963 kvfree(vmas);
8964 return ret;
8965}
8966
2b358604 8967static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 8968{
87094465
PB
8969 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
8970 return ctx->user_bufs ? 0 : -ENOMEM;
2b358604 8971}
edafccee 8972
2b358604
BM
8973static int io_buffer_validate(struct iovec *iov)
8974{
50e96989
PB
8975 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
8976
2b358604
BM
8977 /*
8978 * Don't impose further limits on the size and buffer
8979 * constraints here, we'll -EINVAL later when IO is
8980 * submitted if they are wrong.
8981 */
6224843d
PB
8982 if (!iov->iov_base)
8983 return iov->iov_len ? -EFAULT : 0;
8984 if (!iov->iov_len)
2b358604 8985 return -EFAULT;
edafccee 8986
2b358604
BM
8987 /* arbitrary limit, but we need something */
8988 if (iov->iov_len > SZ_1G)
8989 return -EFAULT;
edafccee 8990
50e96989
PB
8991 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
8992 return -EOVERFLOW;
8993
2b358604
BM
8994 return 0;
8995}
edafccee 8996
2b358604 8997static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
634d00df 8998 unsigned int nr_args, u64 __user *tags)
2b358604 8999{
bd54b6fe
BM
9000 struct page *last_hpage = NULL;
9001 struct io_rsrc_data *data;
2b358604
BM
9002 int i, ret;
9003 struct iovec iov;
edafccee 9004
87094465
PB
9005 if (ctx->user_bufs)
9006 return -EBUSY;
489809e2 9007 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
87094465 9008 return -EINVAL;
bd54b6fe 9009 ret = io_rsrc_node_switch_start(ctx);
2b358604
BM
9010 if (ret)
9011 return ret;
d878c816
PB
9012 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9013 if (ret)
9014 return ret;
bd54b6fe
BM
9015 ret = io_buffers_map_alloc(ctx, nr_args);
9016 if (ret) {
bb6659cc 9017 io_rsrc_data_free(data);
bd54b6fe
BM
9018 return ret;
9019 }
edafccee 9020
87094465 9021 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
edafccee
JA
9022 ret = io_copy_iov(ctx, &iov, arg, i);
9023 if (ret)
0a96bbe4 9024 break;
2b358604
BM
9025 ret = io_buffer_validate(&iov);
9026 if (ret)
0a96bbe4 9027 break;
2d091d62 9028 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
cf3770e7
CIK
9029 ret = -EINVAL;
9030 break;
9031 }
edafccee 9032
41edf1a5
PB
9033 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9034 &last_hpage);
0a96bbe4
BM
9035 if (ret)
9036 break;
edafccee 9037 }
0a96bbe4 9038
bd54b6fe 9039 WARN_ON_ONCE(ctx->buf_data);
0a96bbe4 9040
bd54b6fe
BM
9041 ctx->buf_data = data;
9042 if (ret)
9043 __io_sqe_buffers_unregister(ctx);
9044 else
9045 io_rsrc_node_switch(ctx, NULL);
edafccee
JA
9046 return ret;
9047}
9048
634d00df
PB
9049static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9050 struct io_uring_rsrc_update2 *up,
9051 unsigned int nr_args)
9052{
9053 u64 __user *tags = u64_to_user_ptr(up->tags);
9054 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
634d00df
PB
9055 struct page *last_hpage = NULL;
9056 bool needs_switch = false;
9057 __u32 done;
9058 int i, err;
9059
9060 if (!ctx->buf_data)
9061 return -ENXIO;
9062 if (up->offset + nr_args > ctx->nr_user_bufs)
9063 return -EINVAL;
9064
9065 for (done = 0; done < nr_args; done++) {
0b8c0e7c
PB
9066 struct io_mapped_ubuf *imu;
9067 int offset = up->offset + done;
634d00df
PB
9068 u64 tag = 0;
9069
9070 err = io_copy_iov(ctx, &iov, iovs, done);
9071 if (err)
9072 break;
9073 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9074 err = -EFAULT;
9075 break;
9076 }
0b8c0e7c
PB
9077 err = io_buffer_validate(&iov);
9078 if (err)
9079 break;
cf3770e7
CIK
9080 if (!iov.iov_base && tag) {
9081 err = -EINVAL;
9082 break;
9083 }
0b8c0e7c
PB
9084 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9085 if (err)
9086 break;
634d00df 9087
0b8c0e7c 9088 i = array_index_nospec(offset, ctx->nr_user_bufs);
6224843d 9089 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
0b8c0e7c
PB
9090 err = io_queue_rsrc_removal(ctx->buf_data, offset,
9091 ctx->rsrc_node, ctx->user_bufs[i]);
9092 if (unlikely(err)) {
9093 io_buffer_unmap(ctx, &imu);
634d00df 9094 break;
0b8c0e7c 9095 }
634d00df
PB
9096 ctx->user_bufs[i] = NULL;
9097 needs_switch = true;
9098 }
9099
0b8c0e7c 9100 ctx->user_bufs[i] = imu;
2d091d62 9101 *io_get_tag_slot(ctx->buf_data, offset) = tag;
634d00df
PB
9102 }
9103
9104 if (needs_switch)
9105 io_rsrc_node_switch(ctx, ctx->buf_data);
9106 return done ? done : err;
9107}
9108
9b402849
JA
9109static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
9110{
9111 __s32 __user *fds = arg;
9112 int fd;
9113
9114 if (ctx->cq_ev_fd)
9115 return -EBUSY;
9116
9117 if (copy_from_user(&fd, fds, sizeof(*fds)))
9118 return -EFAULT;
9119
9120 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
9121 if (IS_ERR(ctx->cq_ev_fd)) {
9122 int ret = PTR_ERR(ctx->cq_ev_fd);
fe7e3257 9123
9b402849
JA
9124 ctx->cq_ev_fd = NULL;
9125 return ret;
9126 }
9127
9128 return 0;
9129}
9130
9131static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9132{
9133 if (ctx->cq_ev_fd) {
9134 eventfd_ctx_put(ctx->cq_ev_fd);
9135 ctx->cq_ev_fd = NULL;
9136 return 0;
9137 }
9138
9139 return -ENXIO;
9140}
9141
5a2e745d
JA
9142static void io_destroy_buffers(struct io_ring_ctx *ctx)
9143{
9e15c3a0
JA
9144 struct io_buffer *buf;
9145 unsigned long index;
9146
8bab4c09 9147 xa_for_each(&ctx->io_buffers, index, buf) {
9e15c3a0 9148 __io_remove_buffers(ctx, buf, index, -1U);
8bab4c09
JA
9149 cond_resched();
9150 }
5a2e745d
JA
9151}
9152
4010fec4 9153static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 9154{
cd0ca2e0 9155 struct io_submit_state *state = &ctx->submit_state;
37f0e767 9156 int nr = 0;
bf019da7 9157
9a4fdbd8 9158 mutex_lock(&ctx->uring_lock);
cd0ca2e0 9159 io_flush_cached_locked_reqs(ctx, state);
c2b6c6bc
PB
9160
9161 while (state->free_list.next) {
9162 struct io_wq_work_node *node;
9163 struct io_kiocb *req;
9164
9165 node = wq_stack_extract(&state->free_list);
9166 req = container_of(node, struct io_kiocb, comp_list);
9167 kmem_cache_free(req_cachep, req);
37f0e767 9168 nr++;
c2b6c6bc 9169 }
37f0e767
PB
9170 if (nr)
9171 percpu_ref_put_many(&ctx->refs, nr);
9a4fdbd8
JA
9172 mutex_unlock(&ctx->uring_lock);
9173}
9174
43597aac 9175static void io_wait_rsrc_data(struct io_rsrc_data *data)
2b188cc1 9176{
43597aac 9177 if (data && !atomic_dec_and_test(&data->refs))
bd54b6fe 9178 wait_for_completion(&data->done);
bd54b6fe 9179}
04fc6c80 9180
c072481d 9181static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2b188cc1 9182{
37d1e2e3 9183 io_sq_thread_finish(ctx);
2aede0e4 9184
37d1e2e3 9185 if (ctx->mm_account) {
2aede0e4
JA
9186 mmdrop(ctx->mm_account);
9187 ctx->mm_account = NULL;
30975825 9188 }
def596e9 9189
43597aac
PB
9190 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9191 io_wait_rsrc_data(ctx->buf_data);
9192 io_wait_rsrc_data(ctx->file_data);
9193
8bad28d8 9194 mutex_lock(&ctx->uring_lock);
43597aac 9195 if (ctx->buf_data)
bd54b6fe 9196 __io_sqe_buffers_unregister(ctx);
43597aac 9197 if (ctx->file_data)
08480400 9198 __io_sqe_files_unregister(ctx);
c4ea060e
PB
9199 if (ctx->rings)
9200 __io_cqring_overflow_flush(ctx, true);
8bad28d8 9201 mutex_unlock(&ctx->uring_lock);
9b402849 9202 io_eventfd_unregister(ctx);
5a2e745d 9203 io_destroy_buffers(ctx);
07db298a
PB
9204 if (ctx->sq_creds)
9205 put_cred(ctx->sq_creds);
def596e9 9206
a7f0ed5a
PB
9207 /* there are no registered resources left, nobody uses it */
9208 if (ctx->rsrc_node)
9209 io_rsrc_node_destroy(ctx->rsrc_node);
8dd03afe 9210 if (ctx->rsrc_backup_node)
b895c9a6 9211 io_rsrc_node_destroy(ctx->rsrc_backup_node);
a7f0ed5a 9212 flush_delayed_work(&ctx->rsrc_put_work);
756ab7c0 9213 flush_delayed_work(&ctx->fallback_work);
a7f0ed5a
PB
9214
9215 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9216 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
def596e9 9217
2b188cc1 9218#if defined(CONFIG_UNIX)
355e8d26
EB
9219 if (ctx->ring_sock) {
9220 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 9221 sock_release(ctx->ring_sock);
355e8d26 9222 }
2b188cc1 9223#endif
ef9dd637 9224 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2b188cc1 9225
75b28aff 9226 io_mem_free(ctx->rings);
2b188cc1 9227 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
9228
9229 percpu_ref_exit(&ctx->refs);
2b188cc1 9230 free_uid(ctx->user);
4010fec4 9231 io_req_caches_free(ctx);
e941894e
JA
9232 if (ctx->hash_map)
9233 io_wq_put_hash(ctx->hash_map);
78076bb6 9234 kfree(ctx->cancel_hash);
6224843d 9235 kfree(ctx->dummy_ubuf);
2b188cc1
JA
9236 kfree(ctx);
9237}
9238
9239static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9240{
9241 struct io_ring_ctx *ctx = file->private_data;
9242 __poll_t mask = 0;
9243
d60aa65b 9244 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
9245 /*
9246 * synchronizes with barrier from wq_has_sleeper call in
9247 * io_commit_cqring
9248 */
2b188cc1 9249 smp_rmb();
90554200 9250 if (!io_sqring_full(ctx))
2b188cc1 9251 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
9252
9253 /*
9254 * Don't flush cqring overflow list here, just do a simple check.
9255 * Otherwise there could possible be ABBA deadlock:
9256 * CPU0 CPU1
9257 * ---- ----
9258 * lock(&ctx->uring_lock);
9259 * lock(&ep->mtx);
9260 * lock(&ctx->uring_lock);
9261 * lock(&ep->mtx);
9262 *
9263 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
9264 * pushs them to do the flush.
9265 */
5ed7a37d 9266 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
2b188cc1
JA
9267 mask |= EPOLLIN | EPOLLRDNORM;
9268
9269 return mask;
9270}
9271
0bead8cd 9272static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 9273{
4379bf8b 9274 const struct cred *creds;
071698e1 9275
61cf9370 9276 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
9277 if (creds) {
9278 put_cred(creds);
0bead8cd 9279 return 0;
1e6fa521 9280 }
0bead8cd
YD
9281
9282 return -EINVAL;
9283}
9284
d56d938b
PB
9285struct io_tctx_exit {
9286 struct callback_head task_work;
9287 struct completion completion;
baf186c4 9288 struct io_ring_ctx *ctx;
d56d938b
PB
9289};
9290
c072481d 9291static __cold void io_tctx_exit_cb(struct callback_head *cb)
d56d938b
PB
9292{
9293 struct io_uring_task *tctx = current->io_uring;
9294 struct io_tctx_exit *work;
9295
9296 work = container_of(cb, struct io_tctx_exit, task_work);
9297 /*
9298 * When @in_idle, we're in cancellation and it's racy to remove the
9299 * node. It'll be removed by the end of cancellation, just ignore it.
9300 */
9301 if (!atomic_read(&tctx->in_idle))
eef51daa 9302 io_uring_del_tctx_node((unsigned long)work->ctx);
d56d938b
PB
9303 complete(&work->completion);
9304}
9305
c072481d 9306static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
28090c13
PB
9307{
9308 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9309
9310 return req->ctx == data;
9311}
9312
c072481d 9313static __cold void io_ring_exit_work(struct work_struct *work)
85faa7b8 9314{
d56d938b 9315 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 9316 unsigned long timeout = jiffies + HZ * 60 * 5;
58d3be2c 9317 unsigned long interval = HZ / 20;
d56d938b
PB
9318 struct io_tctx_exit exit;
9319 struct io_tctx_node *node;
9320 int ret;
85faa7b8 9321
56952e91
JA
9322 /*
9323 * If we're doing polled IO and end up having requests being
9324 * submitted async (out-of-line), then completions can come in while
9325 * we're waiting for refs to drop. We need to reap these manually,
9326 * as nobody else will be looking for them.
9327 */
b2edc0a7 9328 do {
3dd0c97a 9329 io_uring_try_cancel_requests(ctx, NULL, true);
28090c13
PB
9330 if (ctx->sq_data) {
9331 struct io_sq_data *sqd = ctx->sq_data;
9332 struct task_struct *tsk;
9333
9334 io_sq_thread_park(sqd);
9335 tsk = sqd->thread;
9336 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
9337 io_wq_cancel_cb(tsk->io_uring->io_wq,
9338 io_cancel_ctx_cb, ctx, true);
9339 io_sq_thread_unpark(sqd);
9340 }
b5bb3a24 9341
37f0e767
PB
9342 io_req_caches_free(ctx);
9343
58d3be2c
PB
9344 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
9345 /* there is little hope left, don't run it too often */
9346 interval = HZ * 60;
9347 }
9348 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
d56d938b 9349
7f00651a
PB
9350 init_completion(&exit.completion);
9351 init_task_work(&exit.task_work, io_tctx_exit_cb);
9352 exit.ctx = ctx;
89b5066e
PB
9353 /*
9354 * Some may use context even when all refs and requests have been put,
9355 * and they are free to do so while still holding uring_lock or
5b0a6acc 9356 * completion_lock, see io_req_task_submit(). Apart from other work,
89b5066e
PB
9357 * this lock/unlock section also waits them to finish.
9358 */
d56d938b
PB
9359 mutex_lock(&ctx->uring_lock);
9360 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
9361 WARN_ON_ONCE(time_after(jiffies, timeout));
9362
d56d938b
PB
9363 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
9364 ctx_node);
7f00651a
PB
9365 /* don't spin on a single task if cancellation failed */
9366 list_rotate_left(&ctx->tctx_list);
d56d938b
PB
9367 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
9368 if (WARN_ON_ONCE(ret))
9369 continue;
d56d938b
PB
9370
9371 mutex_unlock(&ctx->uring_lock);
9372 wait_for_completion(&exit.completion);
d56d938b
PB
9373 mutex_lock(&ctx->uring_lock);
9374 }
9375 mutex_unlock(&ctx->uring_lock);
79ebeaee
JA
9376 spin_lock(&ctx->completion_lock);
9377 spin_unlock(&ctx->completion_lock);
d56d938b 9378
85faa7b8
JA
9379 io_ring_ctx_free(ctx);
9380}
9381
80c4cbdb 9382/* Returns true if we found and killed one or more timeouts */
c072481d
PB
9383static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
9384 struct task_struct *tsk, bool cancel_all)
80c4cbdb
PB
9385{
9386 struct io_kiocb *req, *tmp;
9387 int canceled = 0;
9388
79ebeaee
JA
9389 spin_lock(&ctx->completion_lock);
9390 spin_lock_irq(&ctx->timeout_lock);
80c4cbdb 9391 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
3dd0c97a 9392 if (io_match_task(req, tsk, cancel_all)) {
80c4cbdb
PB
9393 io_kill_timeout(req, -ECANCELED);
9394 canceled++;
9395 }
9396 }
79ebeaee 9397 spin_unlock_irq(&ctx->timeout_lock);
51520426
PB
9398 if (canceled != 0)
9399 io_commit_cqring(ctx);
79ebeaee 9400 spin_unlock(&ctx->completion_lock);
80c4cbdb
PB
9401 if (canceled != 0)
9402 io_cqring_ev_posted(ctx);
9403 return canceled != 0;
9404}
9405
c072481d 9406static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2b188cc1 9407{
61cf9370
MWO
9408 unsigned long index;
9409 struct creds *creds;
9410
2b188cc1
JA
9411 mutex_lock(&ctx->uring_lock);
9412 percpu_ref_kill(&ctx->refs);
634578f8 9413 if (ctx->rings)
6c2450ae 9414 __io_cqring_overflow_flush(ctx, true);
61cf9370
MWO
9415 xa_for_each(&ctx->personalities, index, creds)
9416 io_unregister_personality(ctx, index);
2b188cc1
JA
9417 mutex_unlock(&ctx->uring_lock);
9418
3dd0c97a
PB
9419 io_kill_timeouts(ctx, NULL, true);
9420 io_poll_remove_all(ctx, NULL, true);
561fb04a 9421
15dff286 9422 /* if we failed setting up the ctx, we might not have any rings */
b2edc0a7 9423 io_iopoll_try_reap_events(ctx);
309fc03a 9424
85faa7b8 9425 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
9426 /*
9427 * Use system_unbound_wq to avoid spawning tons of event kworkers
9428 * if we're exiting a ton of rings at the same time. It just adds
9429 * noise and overhead, there's no discernable change in runtime
9430 * over using system_wq.
9431 */
9432 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
9433}
9434
9435static int io_uring_release(struct inode *inode, struct file *file)
9436{
9437 struct io_ring_ctx *ctx = file->private_data;
9438
9439 file->private_data = NULL;
9440 io_ring_ctx_wait_and_kill(ctx);
9441 return 0;
9442}
9443
f6edbabb
PB
9444struct io_task_cancel {
9445 struct task_struct *task;
3dd0c97a 9446 bool all;
f6edbabb 9447};
f254ac04 9448
f6edbabb 9449static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 9450{
9a472ef7 9451 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 9452 struct io_task_cancel *cancel = data;
9a472ef7
PB
9453 bool ret;
9454
3dd0c97a 9455 if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
9a472ef7
PB
9456 struct io_ring_ctx *ctx = req->ctx;
9457
9458 /* protect against races with linked timeouts */
79ebeaee 9459 spin_lock(&ctx->completion_lock);
3dd0c97a 9460 ret = io_match_task(req, cancel->task, cancel->all);
79ebeaee 9461 spin_unlock(&ctx->completion_lock);
9a472ef7 9462 } else {
3dd0c97a 9463 ret = io_match_task(req, cancel->task, cancel->all);
9a472ef7
PB
9464 }
9465 return ret;
b711d4ea
JA
9466}
9467
c072481d
PB
9468static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9469 struct task_struct *task,
9470 bool cancel_all)
b7ddce3c 9471{
e1915f76 9472 struct io_defer_entry *de;
b7ddce3c
PB
9473 LIST_HEAD(list);
9474
79ebeaee 9475 spin_lock(&ctx->completion_lock);
b7ddce3c 9476 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
3dd0c97a 9477 if (io_match_task(de->req, task, cancel_all)) {
b7ddce3c
PB
9478 list_cut_position(&list, &ctx->defer_list, &de->list);
9479 break;
9480 }
9481 }
79ebeaee 9482 spin_unlock(&ctx->completion_lock);
e1915f76
PB
9483 if (list_empty(&list))
9484 return false;
b7ddce3c
PB
9485
9486 while (!list_empty(&list)) {
9487 de = list_first_entry(&list, struct io_defer_entry, list);
9488 list_del_init(&de->list);
f41db273 9489 io_req_complete_failed(de->req, -ECANCELED);
b7ddce3c
PB
9490 kfree(de);
9491 }
e1915f76 9492 return true;
b7ddce3c
PB
9493}
9494
c072481d 9495static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
1b00764f
PB
9496{
9497 struct io_tctx_node *node;
9498 enum io_wq_cancel cret;
9499 bool ret = false;
9500
9501 mutex_lock(&ctx->uring_lock);
9502 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9503 struct io_uring_task *tctx = node->task->io_uring;
9504
9505 /*
9506 * io_wq will stay alive while we hold uring_lock, because it's
9507 * killed after ctx nodes, which requires to take the lock.
9508 */
9509 if (!tctx || !tctx->io_wq)
9510 continue;
9511 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9512 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9513 }
9514 mutex_unlock(&ctx->uring_lock);
9515
9516 return ret;
9517}
9518
c072481d
PB
9519static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
9520 struct task_struct *task,
9521 bool cancel_all)
9936c7c2 9522{
3dd0c97a 9523 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
1b00764f 9524 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2
PB
9525
9526 while (1) {
9527 enum io_wq_cancel cret;
9528 bool ret = false;
9529
1b00764f
PB
9530 if (!task) {
9531 ret |= io_uring_try_cancel_iowq(ctx);
9532 } else if (tctx && tctx->io_wq) {
9533 /*
9534 * Cancels requests of all rings, not only @ctx, but
9535 * it's fine as the task is in exit/exec.
9536 */
5aa75ed5 9537 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
9538 &cancel, true);
9539 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9540 }
9541
9542 /* SQPOLL thread does its own polling */
3dd0c97a 9543 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
d052d1d6 9544 (ctx->sq_data && ctx->sq_data->thread == current)) {
5eef4e87 9545 while (!wq_list_empty(&ctx->iopoll_list)) {
9936c7c2
PB
9546 io_iopoll_try_reap_events(ctx);
9547 ret = true;
9548 }
9549 }
9550
3dd0c97a
PB
9551 ret |= io_cancel_defer_files(ctx, task, cancel_all);
9552 ret |= io_poll_remove_all(ctx, task, cancel_all);
9553 ret |= io_kill_timeouts(ctx, task, cancel_all);
e5dc480d
PB
9554 if (task)
9555 ret |= io_run_task_work();
9936c7c2
PB
9556 if (!ret)
9557 break;
9558 cond_resched();
9559 }
9560}
9561
eef51daa 9562static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
0f212204 9563{
236434c3 9564 struct io_uring_task *tctx = current->io_uring;
13bf43f5 9565 struct io_tctx_node *node;
a528b04e 9566 int ret;
236434c3
MWO
9567
9568 if (unlikely(!tctx)) {
5aa75ed5 9569 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
9570 if (unlikely(ret))
9571 return ret;
236434c3 9572 tctx = current->io_uring;
0f212204 9573 }
cf27f3b1
PB
9574 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9575 node = kmalloc(sizeof(*node), GFP_KERNEL);
9576 if (!node)
9577 return -ENOMEM;
9578 node->ctx = ctx;
9579 node->task = current;
13bf43f5 9580
cf27f3b1
PB
9581 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9582 node, GFP_KERNEL));
9583 if (ret) {
9584 kfree(node);
9585 return ret;
0f212204 9586 }
cf27f3b1
PB
9587
9588 mutex_lock(&ctx->uring_lock);
9589 list_add(&node->ctx_node, &ctx->tctx_list);
9590 mutex_unlock(&ctx->uring_lock);
0f212204 9591 }
cf27f3b1 9592 tctx->last = ctx;
0f212204
JA
9593 return 0;
9594}
9595
cf27f3b1
PB
9596/*
9597 * Note that this task has used io_uring. We use it for cancelation purposes.
9598 */
eef51daa 9599static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
cf27f3b1
PB
9600{
9601 struct io_uring_task *tctx = current->io_uring;
9602
9603 if (likely(tctx && tctx->last == ctx))
9604 return 0;
eef51daa 9605 return __io_uring_add_tctx_node(ctx);
cf27f3b1
PB
9606}
9607
0f212204
JA
9608/*
9609 * Remove this io_uring_file -> task mapping.
9610 */
c072481d 9611static __cold void io_uring_del_tctx_node(unsigned long index)
0f212204
JA
9612{
9613 struct io_uring_task *tctx = current->io_uring;
13bf43f5 9614 struct io_tctx_node *node;
2941267b 9615
eebd2e37
PB
9616 if (!tctx)
9617 return;
13bf43f5
PB
9618 node = xa_erase(&tctx->xa, index);
9619 if (!node)
2941267b 9620 return;
0f212204 9621
13bf43f5
PB
9622 WARN_ON_ONCE(current != node->task);
9623 WARN_ON_ONCE(list_empty(&node->ctx_node));
9624
9625 mutex_lock(&node->ctx->uring_lock);
9626 list_del(&node->ctx_node);
9627 mutex_unlock(&node->ctx->uring_lock);
9628
baf186c4 9629 if (tctx->last == node->ctx)
0f212204 9630 tctx->last = NULL;
13bf43f5 9631 kfree(node);
0f212204
JA
9632}
9633
c072481d 9634static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 9635{
ba5ef6dc 9636 struct io_wq *wq = tctx->io_wq;
13bf43f5 9637 struct io_tctx_node *node;
de7f1d9e
PB
9638 unsigned long index;
9639
8bab4c09 9640 xa_for_each(&tctx->xa, index, node) {
eef51daa 9641 io_uring_del_tctx_node(index);
8bab4c09
JA
9642 cond_resched();
9643 }
b16ef427
ME
9644 if (wq) {
9645 /*
9646 * Must be after io_uring_del_task_file() (removes nodes under
9647 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
9648 */
ba5ef6dc 9649 io_wq_put_and_exit(wq);
dadebc35 9650 tctx->io_wq = NULL;
b16ef427 9651 }
de7f1d9e
PB
9652}
9653
3f48cf18 9654static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
521d6a73 9655{
3f48cf18
PB
9656 if (tracked)
9657 return atomic_read(&tctx->inflight_tracked);
521d6a73
PB
9658 return percpu_counter_sum(&tctx->inflight);
9659}
9660
c072481d 9661static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
09899b19
PB
9662{
9663 struct io_uring_task *tctx = task->io_uring;
9664 unsigned int refs = tctx->cached_refs;
9665
e9dbe221
PB
9666 if (refs) {
9667 tctx->cached_refs = 0;
9668 percpu_counter_sub(&tctx->inflight, refs);
9669 put_task_struct_many(task, refs);
9670 }
09899b19
PB
9671}
9672
78cc687b
PB
9673/*
9674 * Find any io_uring ctx that this task has registered or done IO on, and cancel
9675 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
9676 */
c072481d
PB
9677static __cold void io_uring_cancel_generic(bool cancel_all,
9678 struct io_sq_data *sqd)
0e9ddb39 9679{
521d6a73 9680 struct io_uring_task *tctx = current->io_uring;
734551df 9681 struct io_ring_ctx *ctx;
0e9ddb39
PB
9682 s64 inflight;
9683 DEFINE_WAIT(wait);
fdaf083c 9684
78cc687b
PB
9685 WARN_ON_ONCE(sqd && sqd->thread != current);
9686
6d042ffb
PO
9687 if (!current->io_uring)
9688 return;
17a91051
PB
9689 if (tctx->io_wq)
9690 io_wq_exit_start(tctx->io_wq);
9691
0e9ddb39
PB
9692 atomic_inc(&tctx->in_idle);
9693 do {
e9dbe221 9694 io_uring_drop_tctx_refs(current);
0e9ddb39 9695 /* read completions before cancelations */
78cc687b 9696 inflight = tctx_inflight(tctx, !cancel_all);
0e9ddb39
PB
9697 if (!inflight)
9698 break;
fdaf083c 9699
78cc687b
PB
9700 if (!sqd) {
9701 struct io_tctx_node *node;
9702 unsigned long index;
0f212204 9703
78cc687b
PB
9704 xa_for_each(&tctx->xa, index, node) {
9705 /* sqpoll task will cancel all its requests */
9706 if (node->ctx->sq_data)
9707 continue;
9708 io_uring_try_cancel_requests(node->ctx, current,
9709 cancel_all);
9710 }
9711 } else {
9712 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9713 io_uring_try_cancel_requests(ctx, current,
9714 cancel_all);
9715 }
17a91051 9716
0f212204 9717 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
e9dbe221 9718 io_uring_drop_tctx_refs(current);
0f212204 9719 /*
a1bb3cd5
PB
9720 * If we've seen completions, retry without waiting. This
9721 * avoids a race where a completion comes in before we did
9722 * prepare_to_wait().
0f212204 9723 */
3dd0c97a 9724 if (inflight == tctx_inflight(tctx, !cancel_all))
a1bb3cd5 9725 schedule();
f57555ed 9726 finish_wait(&tctx->wait, &wait);
d8a6df10 9727 } while (1);
fdaf083c 9728 atomic_dec(&tctx->in_idle);
de7f1d9e 9729
8452d4a6 9730 io_uring_clean_tctx(tctx);
3dd0c97a 9731 if (cancel_all) {
3f48cf18
PB
9732 /* for exec all current's requests should be gone, kill tctx */
9733 __io_uring_free(current);
9734 }
44e728b8
PB
9735}
9736
f552a27a 9737void __io_uring_cancel(bool cancel_all)
78cc687b 9738{
f552a27a 9739 io_uring_cancel_generic(cancel_all, NULL);
78cc687b
PB
9740}
9741
6c5c240e
RP
9742static void *io_uring_validate_mmap_request(struct file *file,
9743 loff_t pgoff, size_t sz)
2b188cc1 9744{
2b188cc1 9745 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 9746 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
9747 struct page *page;
9748 void *ptr;
9749
9750 switch (offset) {
9751 case IORING_OFF_SQ_RING:
75b28aff
HV
9752 case IORING_OFF_CQ_RING:
9753 ptr = ctx->rings;
2b188cc1
JA
9754 break;
9755 case IORING_OFF_SQES:
9756 ptr = ctx->sq_sqes;
9757 break;
2b188cc1 9758 default:
6c5c240e 9759 return ERR_PTR(-EINVAL);
2b188cc1
JA
9760 }
9761
9762 page = virt_to_head_page(ptr);
a50b854e 9763 if (sz > page_size(page))
6c5c240e
RP
9764 return ERR_PTR(-EINVAL);
9765
9766 return ptr;
9767}
9768
9769#ifdef CONFIG_MMU
9770
c072481d 9771static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6c5c240e
RP
9772{
9773 size_t sz = vma->vm_end - vma->vm_start;
9774 unsigned long pfn;
9775 void *ptr;
9776
9777 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9778 if (IS_ERR(ptr))
9779 return PTR_ERR(ptr);
2b188cc1
JA
9780
9781 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9782 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9783}
9784
6c5c240e
RP
9785#else /* !CONFIG_MMU */
9786
9787static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9788{
9789 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9790}
9791
9792static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9793{
9794 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9795}
9796
9797static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9798 unsigned long addr, unsigned long len,
9799 unsigned long pgoff, unsigned long flags)
9800{
9801 void *ptr;
9802
9803 ptr = io_uring_validate_mmap_request(file, pgoff, len);
9804 if (IS_ERR(ptr))
9805 return PTR_ERR(ptr);
9806
9807 return (unsigned long) ptr;
9808}
9809
9810#endif /* !CONFIG_MMU */
9811
d9d05217 9812static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
9813{
9814 DEFINE_WAIT(wait);
9815
9816 do {
9817 if (!io_sqring_full(ctx))
9818 break;
90554200
JA
9819 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9820
9821 if (!io_sqring_full(ctx))
9822 break;
90554200
JA
9823 schedule();
9824 } while (!signal_pending(current));
9825
9826 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 9827 return 0;
90554200
JA
9828}
9829
c73ebb68
HX
9830static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9831 struct __kernel_timespec __user **ts,
9832 const sigset_t __user **sig)
9833{
9834 struct io_uring_getevents_arg arg;
9835
9836 /*
9837 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9838 * is just a pointer to the sigset_t.
9839 */
9840 if (!(flags & IORING_ENTER_EXT_ARG)) {
9841 *sig = (const sigset_t __user *) argp;
9842 *ts = NULL;
9843 return 0;
9844 }
9845
9846 /*
9847 * EXT_ARG is set - ensure we agree on the size of it and copy in our
9848 * timespec and sigset_t pointers if good.
9849 */
9850 if (*argsz != sizeof(arg))
9851 return -EINVAL;
9852 if (copy_from_user(&arg, argp, sizeof(arg)))
9853 return -EFAULT;
9854 *sig = u64_to_user_ptr(arg.sigmask);
9855 *argsz = arg.sigmask_sz;
9856 *ts = u64_to_user_ptr(arg.ts);
9857 return 0;
9858}
9859
2b188cc1 9860SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
9861 u32, min_complete, u32, flags, const void __user *, argp,
9862 size_t, argsz)
2b188cc1
JA
9863{
9864 struct io_ring_ctx *ctx;
2b188cc1
JA
9865 int submitted = 0;
9866 struct fd f;
33f993da 9867 long ret;
2b188cc1 9868
4c6e277c 9869 io_run_task_work();
b41e9852 9870
33f993da
PB
9871 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9872 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
2b188cc1
JA
9873 return -EINVAL;
9874
9875 f = fdget(fd);
33f993da 9876 if (unlikely(!f.file))
2b188cc1
JA
9877 return -EBADF;
9878
9879 ret = -EOPNOTSUPP;
33f993da 9880 if (unlikely(f.file->f_op != &io_uring_fops))
2b188cc1
JA
9881 goto out_fput;
9882
9883 ret = -ENXIO;
9884 ctx = f.file->private_data;
33f993da 9885 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
2b188cc1
JA
9886 goto out_fput;
9887
7e84e1c7 9888 ret = -EBADFD;
33f993da 9889 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7e84e1c7
SG
9890 goto out;
9891
6c271ce2
JA
9892 /*
9893 * For SQ polling, the thread will do all submissions and completions.
9894 * Just return the requested submit count, and wake the thread if
9895 * we were asked to.
9896 */
b2a9eada 9897 ret = 0;
6c271ce2 9898 if (ctx->flags & IORING_SETUP_SQPOLL) {
90f67366 9899 io_cqring_overflow_flush(ctx);
89448c47 9900
21f96522
JA
9901 if (unlikely(ctx->sq_data->thread == NULL)) {
9902 ret = -EOWNERDEAD;
04147488 9903 goto out;
21f96522 9904 }
6c271ce2 9905 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 9906 wake_up(&ctx->sq_data->wait);
d9d05217
PB
9907 if (flags & IORING_ENTER_SQ_WAIT) {
9908 ret = io_sqpoll_wait_sq(ctx);
9909 if (ret)
9910 goto out;
9911 }
6c271ce2 9912 submitted = to_submit;
b2a9eada 9913 } else if (to_submit) {
eef51daa 9914 ret = io_uring_add_tctx_node(ctx);
0f212204
JA
9915 if (unlikely(ret))
9916 goto out;
2b188cc1 9917 mutex_lock(&ctx->uring_lock);
0f212204 9918 submitted = io_submit_sqes(ctx, to_submit);
2b188cc1 9919 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
9920
9921 if (submitted != to_submit)
9922 goto out;
2b188cc1
JA
9923 }
9924 if (flags & IORING_ENTER_GETEVENTS) {
c73ebb68
HX
9925 const sigset_t __user *sig;
9926 struct __kernel_timespec __user *ts;
9927
9928 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9929 if (unlikely(ret))
9930 goto out;
9931
2b188cc1
JA
9932 min_complete = min(min_complete, ctx->cq_entries);
9933
32b2244a
XW
9934 /*
9935 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9936 * space applications don't need to do io completion events
9937 * polling again, they can rely on io_sq_thread to do polling
9938 * work, which can reduce cpu usage and uring_lock contention.
9939 */
9940 if (ctx->flags & IORING_SETUP_IOPOLL &&
9941 !(ctx->flags & IORING_SETUP_SQPOLL)) {
7668b92a 9942 ret = io_iopoll_check(ctx, min_complete);
def596e9 9943 } else {
c73ebb68 9944 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 9945 }
2b188cc1
JA
9946 }
9947
7c504e65 9948out:
6805b32e 9949 percpu_ref_put(&ctx->refs);
2b188cc1
JA
9950out_fput:
9951 fdput(f);
9952 return submitted ? submitted : ret;
9953}
9954
bebdb65e 9955#ifdef CONFIG_PROC_FS
c072481d 9956static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
61cf9370 9957 const struct cred *cred)
87ce955b 9958{
87ce955b
JA
9959 struct user_namespace *uns = seq_user_ns(m);
9960 struct group_info *gi;
9961 kernel_cap_t cap;
9962 unsigned __capi;
9963 int g;
9964
9965 seq_printf(m, "%5d\n", id);
9966 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9967 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9968 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9969 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9970 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9971 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9972 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9973 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9974 seq_puts(m, "\n\tGroups:\t");
9975 gi = cred->group_info;
9976 for (g = 0; g < gi->ngroups; g++) {
9977 seq_put_decimal_ull(m, g ? " " : "",
9978 from_kgid_munged(uns, gi->gid[g]));
9979 }
9980 seq_puts(m, "\n\tCapEff:\t");
9981 cap = cred->cap_effective;
9982 CAP_FOR_EACH_U32(__capi)
9983 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9984 seq_putc(m, '\n');
9985 return 0;
9986}
9987
c072481d
PB
9988static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
9989 struct seq_file *m)
87ce955b 9990{
dbbe9c64 9991 struct io_sq_data *sq = NULL;
83f84356
HX
9992 struct io_overflow_cqe *ocqe;
9993 struct io_rings *r = ctx->rings;
9994 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
9995 unsigned int cached_sq_head = ctx->cached_sq_head;
9996 unsigned int cached_cq_tail = ctx->cached_cq_tail;
9997 unsigned int sq_head = READ_ONCE(r->sq.head);
9998 unsigned int sq_tail = READ_ONCE(r->sq.tail);
9999 unsigned int cq_head = READ_ONCE(r->cq.head);
10000 unsigned int cq_tail = READ_ONCE(r->cq.tail);
fad8e0de 10001 bool has_lock;
83f84356
HX
10002 unsigned int i;
10003
10004 /*
10005 * we may get imprecise sqe and cqe info if uring is actively running
10006 * since we get cached_sq_head and cached_cq_tail without uring_lock
10007 * and sq_tail and cq_head are changed by userspace. But it's ok since
10008 * we usually use these info when it is stuck.
10009 */
10010 seq_printf(m, "SqHead:\t%u\n", sq_head & sq_mask);
10011 seq_printf(m, "SqTail:\t%u\n", sq_tail & sq_mask);
10012 seq_printf(m, "CachedSqHead:\t%u\n", cached_sq_head & sq_mask);
10013 seq_printf(m, "CqHead:\t%u\n", cq_head & cq_mask);
10014 seq_printf(m, "CqTail:\t%u\n", cq_tail & cq_mask);
10015 seq_printf(m, "CachedCqTail:\t%u\n", cached_cq_tail & cq_mask);
10016 seq_printf(m, "SQEs:\t%u\n", sq_tail - cached_sq_head);
10017 for (i = cached_sq_head; i < sq_tail; i++) {
10018 unsigned int sq_idx = READ_ONCE(ctx->sq_array[i & sq_mask]);
10019
10020 if (likely(sq_idx <= sq_mask)) {
10021 struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx];
10022
10023 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10024 sq_idx, sqe->opcode, sqe->fd, sqe->flags, sqe->user_data);
10025 }
10026 }
10027 seq_printf(m, "CQEs:\t%u\n", cached_cq_tail - cq_head);
10028 for (i = cq_head; i < cached_cq_tail; i++) {
10029 struct io_uring_cqe *cqe = &r->cqes[i & cq_mask];
10030
10031 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10032 i & cq_mask, cqe->user_data, cqe->res, cqe->flags);
10033 }
87ce955b 10034
fad8e0de
JA
10035 /*
10036 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10037 * since fdinfo case grabs it in the opposite direction of normal use
10038 * cases. If we fail to get the lock, we just don't iterate any
10039 * structures that could be going away outside the io_uring mutex.
10040 */
10041 has_lock = mutex_trylock(&ctx->uring_lock);
10042
5f3f26f9 10043 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 10044 sq = ctx->sq_data;
5f3f26f9
JA
10045 if (!sq->thread)
10046 sq = NULL;
10047 }
dbbe9c64
JQ
10048
10049 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10050 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 10051 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 10052 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7b29f92d 10053 struct file *f = io_file_from_index(ctx, i);
87ce955b 10054
87ce955b
JA
10055 if (f)
10056 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10057 else
10058 seq_printf(m, "%5u: <none>\n", i);
10059 }
10060 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 10061 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
41edf1a5 10062 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
4751f53d 10063 unsigned int len = buf->ubuf_end - buf->ubuf;
87ce955b 10064
4751f53d 10065 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
87ce955b 10066 }
61cf9370
MWO
10067 if (has_lock && !xa_empty(&ctx->personalities)) {
10068 unsigned long index;
10069 const struct cred *cred;
10070
87ce955b 10071 seq_printf(m, "Personalities:\n");
61cf9370
MWO
10072 xa_for_each(&ctx->personalities, index, cred)
10073 io_uring_show_cred(m, index, cred);
87ce955b 10074 }
83f84356
HX
10075 if (has_lock)
10076 mutex_unlock(&ctx->uring_lock);
10077
10078 seq_puts(m, "PollList:\n");
79ebeaee 10079 spin_lock(&ctx->completion_lock);
d7718a9d
JA
10080 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10081 struct hlist_head *list = &ctx->cancel_hash[i];
10082 struct io_kiocb *req;
10083
10084 hlist_for_each_entry(req, list, hash_node)
10085 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
10086 req->task->task_works != NULL);
10087 }
83f84356
HX
10088
10089 seq_puts(m, "CqOverflowList:\n");
10090 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10091 struct io_uring_cqe *cqe = &ocqe->cqe;
10092
10093 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
10094 cqe->user_data, cqe->res, cqe->flags);
10095
10096 }
10097
79ebeaee 10098 spin_unlock(&ctx->completion_lock);
87ce955b
JA
10099}
10100
c072481d 10101static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
87ce955b
JA
10102{
10103 struct io_ring_ctx *ctx = f->private_data;
10104
10105 if (percpu_ref_tryget(&ctx->refs)) {
10106 __io_uring_show_fdinfo(ctx, m);
10107 percpu_ref_put(&ctx->refs);
10108 }
10109}
bebdb65e 10110#endif
87ce955b 10111
2b188cc1
JA
10112static const struct file_operations io_uring_fops = {
10113 .release = io_uring_release,
10114 .mmap = io_uring_mmap,
6c5c240e
RP
10115#ifndef CONFIG_MMU
10116 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10117 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10118#endif
2b188cc1 10119 .poll = io_uring_poll,
bebdb65e 10120#ifdef CONFIG_PROC_FS
87ce955b 10121 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 10122#endif
2b188cc1
JA
10123};
10124
c072481d
PB
10125static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10126 struct io_uring_params *p)
2b188cc1 10127{
75b28aff
HV
10128 struct io_rings *rings;
10129 size_t size, sq_array_offset;
2b188cc1 10130
bd740481
JA
10131 /* make sure these are sane, as we already accounted them */
10132 ctx->sq_entries = p->sq_entries;
10133 ctx->cq_entries = p->cq_entries;
10134
75b28aff
HV
10135 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10136 if (size == SIZE_MAX)
10137 return -EOVERFLOW;
10138
10139 rings = io_mem_alloc(size);
10140 if (!rings)
2b188cc1
JA
10141 return -ENOMEM;
10142
75b28aff
HV
10143 ctx->rings = rings;
10144 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10145 rings->sq_ring_mask = p->sq_entries - 1;
10146 rings->cq_ring_mask = p->cq_entries - 1;
10147 rings->sq_ring_entries = p->sq_entries;
10148 rings->cq_ring_entries = p->cq_entries;
2b188cc1
JA
10149
10150 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
10151 if (size == SIZE_MAX) {
10152 io_mem_free(ctx->rings);
10153 ctx->rings = NULL;
2b188cc1 10154 return -EOVERFLOW;
eb065d30 10155 }
2b188cc1
JA
10156
10157 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
10158 if (!ctx->sq_sqes) {
10159 io_mem_free(ctx->rings);
10160 ctx->rings = NULL;
2b188cc1 10161 return -ENOMEM;
eb065d30 10162 }
2b188cc1 10163
2b188cc1
JA
10164 return 0;
10165}
10166
9faadcc8
PB
10167static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10168{
10169 int ret, fd;
10170
10171 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10172 if (fd < 0)
10173 return fd;
10174
eef51daa 10175 ret = io_uring_add_tctx_node(ctx);
9faadcc8
PB
10176 if (ret) {
10177 put_unused_fd(fd);
10178 return ret;
10179 }
10180 fd_install(fd, file);
10181 return fd;
10182}
10183
2b188cc1
JA
10184/*
10185 * Allocate an anonymous fd, this is what constitutes the application
10186 * visible backing of an io_uring instance. The application mmaps this
10187 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10188 * we have to tie this fd to a socket for file garbage collection purposes.
10189 */
9faadcc8 10190static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
10191{
10192 struct file *file;
9faadcc8 10193#if defined(CONFIG_UNIX)
2b188cc1
JA
10194 int ret;
10195
2b188cc1
JA
10196 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10197 &ctx->ring_sock);
10198 if (ret)
9faadcc8 10199 return ERR_PTR(ret);
2b188cc1
JA
10200#endif
10201
2b188cc1
JA
10202 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
10203 O_RDWR | O_CLOEXEC);
2b188cc1 10204#if defined(CONFIG_UNIX)
9faadcc8
PB
10205 if (IS_ERR(file)) {
10206 sock_release(ctx->ring_sock);
10207 ctx->ring_sock = NULL;
10208 } else {
10209 ctx->ring_sock->file = file;
0f212204 10210 }
2b188cc1 10211#endif
9faadcc8 10212 return file;
2b188cc1
JA
10213}
10214
c072481d
PB
10215static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
10216 struct io_uring_params __user *params)
2b188cc1 10217{
2b188cc1 10218 struct io_ring_ctx *ctx;
9faadcc8 10219 struct file *file;
2b188cc1
JA
10220 int ret;
10221
8110c1a6 10222 if (!entries)
2b188cc1 10223 return -EINVAL;
8110c1a6
JA
10224 if (entries > IORING_MAX_ENTRIES) {
10225 if (!(p->flags & IORING_SETUP_CLAMP))
10226 return -EINVAL;
10227 entries = IORING_MAX_ENTRIES;
10228 }
2b188cc1
JA
10229
10230 /*
10231 * Use twice as many entries for the CQ ring. It's possible for the
10232 * application to drive a higher depth than the size of the SQ ring,
10233 * since the sqes are only used at submission time. This allows for
33a107f0
JA
10234 * some flexibility in overcommitting a bit. If the application has
10235 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
10236 * of CQ ring entries manually.
2b188cc1
JA
10237 */
10238 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
10239 if (p->flags & IORING_SETUP_CQSIZE) {
10240 /*
10241 * If IORING_SETUP_CQSIZE is set, we do the same roundup
10242 * to a power-of-two, if it isn't already. We do NOT impose
10243 * any cq vs sq ring sizing.
10244 */
eb2667b3 10245 if (!p->cq_entries)
33a107f0 10246 return -EINVAL;
8110c1a6
JA
10247 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10248 if (!(p->flags & IORING_SETUP_CLAMP))
10249 return -EINVAL;
10250 p->cq_entries = IORING_MAX_CQ_ENTRIES;
10251 }
eb2667b3
JQ
10252 p->cq_entries = roundup_pow_of_two(p->cq_entries);
10253 if (p->cq_entries < p->sq_entries)
10254 return -EINVAL;
33a107f0
JA
10255 } else {
10256 p->cq_entries = 2 * p->sq_entries;
10257 }
2b188cc1 10258
2b188cc1 10259 ctx = io_ring_ctx_alloc(p);
62e398be 10260 if (!ctx)
2b188cc1 10261 return -ENOMEM;
2b188cc1 10262 ctx->compat = in_compat_syscall();
62e398be
JA
10263 if (!capable(CAP_IPC_LOCK))
10264 ctx->user = get_uid(current_user());
2aede0e4
JA
10265
10266 /*
10267 * This is just grabbed for accounting purposes. When a process exits,
10268 * the mm is exited and dropped before the files, hence we need to hang
10269 * on to this mm purely for the purposes of being able to unaccount
10270 * memory (locked/pinned vm). It's not used for anything else.
10271 */
6b7898eb 10272 mmgrab(current->mm);
2aede0e4 10273 ctx->mm_account = current->mm;
6b7898eb 10274
2b188cc1
JA
10275 ret = io_allocate_scq_urings(ctx, p);
10276 if (ret)
10277 goto err;
10278
7e84e1c7 10279 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
10280 if (ret)
10281 goto err;
eae071c9 10282 /* always set a rsrc node */
47b228ce
PB
10283 ret = io_rsrc_node_switch_start(ctx);
10284 if (ret)
10285 goto err;
eae071c9 10286 io_rsrc_node_switch(ctx, NULL);
2b188cc1 10287
2b188cc1 10288 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
10289 p->sq_off.head = offsetof(struct io_rings, sq.head);
10290 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10291 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10292 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10293 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10294 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10295 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
10296
10297 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
10298 p->cq_off.head = offsetof(struct io_rings, cq.head);
10299 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10300 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10301 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10302 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10303 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 10304 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 10305
7f13657d
XW
10306 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10307 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 10308 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 10309 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9690557e
PB
10310 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10311 IORING_FEAT_RSRC_TAGS;
7f13657d
XW
10312
10313 if (copy_to_user(params, p, sizeof(*p))) {
10314 ret = -EFAULT;
10315 goto err;
10316 }
d1719f70 10317
9faadcc8
PB
10318 file = io_uring_get_file(ctx);
10319 if (IS_ERR(file)) {
10320 ret = PTR_ERR(file);
10321 goto err;
10322 }
10323
044c1ab3
JA
10324 /*
10325 * Install ring fd as the very last thing, so we don't risk someone
10326 * having closed it before we finish setup
10327 */
9faadcc8
PB
10328 ret = io_uring_install_fd(ctx, file);
10329 if (ret < 0) {
10330 /* fput will clean it up */
10331 fput(file);
10332 return ret;
10333 }
044c1ab3 10334
c826bd7a 10335 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
10336 return ret;
10337err:
10338 io_ring_ctx_wait_and_kill(ctx);
10339 return ret;
10340}
10341
10342/*
10343 * Sets up an aio uring context, and returns the fd. Applications asks for a
10344 * ring size, we return the actual sq/cq ring sizes (among other things) in the
10345 * params structure passed in.
10346 */
10347static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10348{
10349 struct io_uring_params p;
2b188cc1
JA
10350 int i;
10351
10352 if (copy_from_user(&p, params, sizeof(p)))
10353 return -EFAULT;
10354 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10355 if (p.resv[i])
10356 return -EINVAL;
10357 }
10358
6c271ce2 10359 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 10360 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7
SG
10361 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10362 IORING_SETUP_R_DISABLED))
2b188cc1
JA
10363 return -EINVAL;
10364
7f13657d 10365 return io_uring_create(entries, &p, params);
2b188cc1
JA
10366}
10367
10368SYSCALL_DEFINE2(io_uring_setup, u32, entries,
10369 struct io_uring_params __user *, params)
10370{
10371 return io_uring_setup(entries, params);
10372}
10373
c072481d
PB
10374static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10375 unsigned nr_args)
66f4af93
JA
10376{
10377 struct io_uring_probe *p;
10378 size_t size;
10379 int i, ret;
10380
10381 size = struct_size(p, ops, nr_args);
10382 if (size == SIZE_MAX)
10383 return -EOVERFLOW;
10384 p = kzalloc(size, GFP_KERNEL);
10385 if (!p)
10386 return -ENOMEM;
10387
10388 ret = -EFAULT;
10389 if (copy_from_user(p, arg, size))
10390 goto out;
10391 ret = -EINVAL;
10392 if (memchr_inv(p, 0, size))
10393 goto out;
10394
10395 p->last_op = IORING_OP_LAST - 1;
10396 if (nr_args > IORING_OP_LAST)
10397 nr_args = IORING_OP_LAST;
10398
10399 for (i = 0; i < nr_args; i++) {
10400 p->ops[i].op = i;
10401 if (!io_op_defs[i].not_supported)
10402 p->ops[i].flags = IO_URING_OP_SUPPORTED;
10403 }
10404 p->ops_len = i;
10405
10406 ret = 0;
10407 if (copy_to_user(arg, p, size))
10408 ret = -EFAULT;
10409out:
10410 kfree(p);
10411 return ret;
10412}
10413
071698e1
JA
10414static int io_register_personality(struct io_ring_ctx *ctx)
10415{
4379bf8b 10416 const struct cred *creds;
61cf9370 10417 u32 id;
1e6fa521 10418 int ret;
071698e1 10419
4379bf8b 10420 creds = get_current_cred();
1e6fa521 10421
61cf9370
MWO
10422 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10423 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
a30f895a
JA
10424 if (ret < 0) {
10425 put_cred(creds);
10426 return ret;
10427 }
10428 return id;
071698e1
JA
10429}
10430
c072481d
PB
10431static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10432 void __user *arg, unsigned int nr_args)
21b55dbc
SG
10433{
10434 struct io_uring_restriction *res;
10435 size_t size;
10436 int i, ret;
10437
7e84e1c7
SG
10438 /* Restrictions allowed only if rings started disabled */
10439 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10440 return -EBADFD;
10441
21b55dbc 10442 /* We allow only a single restrictions registration */
7e84e1c7 10443 if (ctx->restrictions.registered)
21b55dbc
SG
10444 return -EBUSY;
10445
10446 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10447 return -EINVAL;
10448
10449 size = array_size(nr_args, sizeof(*res));
10450 if (size == SIZE_MAX)
10451 return -EOVERFLOW;
10452
10453 res = memdup_user(arg, size);
10454 if (IS_ERR(res))
10455 return PTR_ERR(res);
10456
10457 ret = 0;
10458
10459 for (i = 0; i < nr_args; i++) {
10460 switch (res[i].opcode) {
10461 case IORING_RESTRICTION_REGISTER_OP:
10462 if (res[i].register_op >= IORING_REGISTER_LAST) {
10463 ret = -EINVAL;
10464 goto out;
10465 }
10466
10467 __set_bit(res[i].register_op,
10468 ctx->restrictions.register_op);
10469 break;
10470 case IORING_RESTRICTION_SQE_OP:
10471 if (res[i].sqe_op >= IORING_OP_LAST) {
10472 ret = -EINVAL;
10473 goto out;
10474 }
10475
10476 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10477 break;
10478 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10479 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10480 break;
10481 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10482 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10483 break;
10484 default:
10485 ret = -EINVAL;
10486 goto out;
10487 }
10488 }
10489
10490out:
10491 /* Reset all restrictions if an error happened */
10492 if (ret != 0)
10493 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10494 else
7e84e1c7 10495 ctx->restrictions.registered = true;
21b55dbc
SG
10496
10497 kfree(res);
10498 return ret;
10499}
10500
7e84e1c7
SG
10501static int io_register_enable_rings(struct io_ring_ctx *ctx)
10502{
10503 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10504 return -EBADFD;
10505
10506 if (ctx->restrictions.registered)
10507 ctx->restricted = 1;
10508
0298ef96
PB
10509 ctx->flags &= ~IORING_SETUP_R_DISABLED;
10510 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10511 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
10512 return 0;
10513}
10514
fdecb662 10515static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 10516 struct io_uring_rsrc_update2 *up,
98f0b3b4
PB
10517 unsigned nr_args)
10518{
10519 __u32 tmp;
10520 int err;
10521
c3bdad02
PB
10522 if (up->resv)
10523 return -EINVAL;
98f0b3b4
PB
10524 if (check_add_overflow(up->offset, nr_args, &tmp))
10525 return -EOVERFLOW;
10526 err = io_rsrc_node_switch_start(ctx);
10527 if (err)
10528 return err;
10529
fdecb662
PB
10530 switch (type) {
10531 case IORING_RSRC_FILE:
98f0b3b4 10532 return __io_sqe_files_update(ctx, up, nr_args);
634d00df
PB
10533 case IORING_RSRC_BUFFER:
10534 return __io_sqe_buffers_update(ctx, up, nr_args);
98f0b3b4
PB
10535 }
10536 return -EINVAL;
10537}
10538
c3bdad02
PB
10539static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10540 unsigned nr_args)
98f0b3b4 10541{
c3bdad02 10542 struct io_uring_rsrc_update2 up;
98f0b3b4
PB
10543
10544 if (!nr_args)
10545 return -EINVAL;
c3bdad02
PB
10546 memset(&up, 0, sizeof(up));
10547 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10548 return -EFAULT;
10549 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10550}
10551
10552static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
992da01a 10553 unsigned size, unsigned type)
c3bdad02
PB
10554{
10555 struct io_uring_rsrc_update2 up;
10556
10557 if (size != sizeof(up))
10558 return -EINVAL;
98f0b3b4
PB
10559 if (copy_from_user(&up, arg, sizeof(up)))
10560 return -EFAULT;
992da01a 10561 if (!up.nr || up.resv)
98f0b3b4 10562 return -EINVAL;
992da01a 10563 return __io_register_rsrc_update(ctx, type, &up, up.nr);
98f0b3b4
PB
10564}
10565
c072481d 10566static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
992da01a 10567 unsigned int size, unsigned int type)
792e3582
PB
10568{
10569 struct io_uring_rsrc_register rr;
10570
10571 /* keep it extendible */
10572 if (size != sizeof(rr))
10573 return -EINVAL;
10574
10575 memset(&rr, 0, sizeof(rr));
10576 if (copy_from_user(&rr, arg, size))
10577 return -EFAULT;
992da01a 10578 if (!rr.nr || rr.resv || rr.resv2)
792e3582
PB
10579 return -EINVAL;
10580
992da01a 10581 switch (type) {
792e3582
PB
10582 case IORING_RSRC_FILE:
10583 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10584 rr.nr, u64_to_user_ptr(rr.tags));
634d00df
PB
10585 case IORING_RSRC_BUFFER:
10586 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10587 rr.nr, u64_to_user_ptr(rr.tags));
792e3582
PB
10588 }
10589 return -EINVAL;
10590}
10591
c072481d
PB
10592static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10593 void __user *arg, unsigned len)
fe76421d
JA
10594{
10595 struct io_uring_task *tctx = current->io_uring;
10596 cpumask_var_t new_mask;
10597 int ret;
10598
10599 if (!tctx || !tctx->io_wq)
10600 return -EINVAL;
10601
10602 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10603 return -ENOMEM;
10604
10605 cpumask_clear(new_mask);
10606 if (len > cpumask_size())
10607 len = cpumask_size();
10608
10609 if (copy_from_user(new_mask, arg, len)) {
10610 free_cpumask_var(new_mask);
10611 return -EFAULT;
10612 }
10613
10614 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10615 free_cpumask_var(new_mask);
10616 return ret;
10617}
10618
c072481d 10619static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
fe76421d
JA
10620{
10621 struct io_uring_task *tctx = current->io_uring;
10622
10623 if (!tctx || !tctx->io_wq)
10624 return -EINVAL;
10625
10626 return io_wq_cpu_affinity(tctx->io_wq, NULL);
10627}
10628
c072481d
PB
10629static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10630 void __user *arg)
2e480058 10631{
fa84693b
JA
10632 struct io_uring_task *tctx = NULL;
10633 struct io_sq_data *sqd = NULL;
2e480058
JA
10634 __u32 new_count[2];
10635 int i, ret;
10636
2e480058
JA
10637 if (copy_from_user(new_count, arg, sizeof(new_count)))
10638 return -EFAULT;
10639 for (i = 0; i < ARRAY_SIZE(new_count); i++)
10640 if (new_count[i] > INT_MAX)
10641 return -EINVAL;
10642
fa84693b
JA
10643 if (ctx->flags & IORING_SETUP_SQPOLL) {
10644 sqd = ctx->sq_data;
10645 if (sqd) {
009ad9f0
JA
10646 /*
10647 * Observe the correct sqd->lock -> ctx->uring_lock
10648 * ordering. Fine to drop uring_lock here, we hold
10649 * a ref to the ctx.
10650 */
41d3a6bd 10651 refcount_inc(&sqd->refs);
009ad9f0 10652 mutex_unlock(&ctx->uring_lock);
fa84693b 10653 mutex_lock(&sqd->lock);
009ad9f0 10654 mutex_lock(&ctx->uring_lock);
41d3a6bd
JA
10655 if (sqd->thread)
10656 tctx = sqd->thread->io_uring;
fa84693b
JA
10657 }
10658 } else {
10659 tctx = current->io_uring;
10660 }
10661
10662 ret = -EINVAL;
10663 if (!tctx || !tctx->io_wq)
10664 goto err;
10665
2e480058
JA
10666 ret = io_wq_max_workers(tctx->io_wq, new_count);
10667 if (ret)
fa84693b
JA
10668 goto err;
10669
41d3a6bd 10670 if (sqd) {
fa84693b 10671 mutex_unlock(&sqd->lock);
41d3a6bd
JA
10672 io_put_sq_data(sqd);
10673 }
2e480058
JA
10674
10675 if (copy_to_user(arg, new_count, sizeof(new_count)))
10676 return -EFAULT;
10677
10678 return 0;
fa84693b 10679err:
41d3a6bd 10680 if (sqd) {
fa84693b 10681 mutex_unlock(&sqd->lock);
41d3a6bd
JA
10682 io_put_sq_data(sqd);
10683 }
fa84693b 10684 return ret;
2e480058
JA
10685}
10686
071698e1
JA
10687static bool io_register_op_must_quiesce(int op)
10688{
10689 switch (op) {
bd54b6fe
BM
10690 case IORING_REGISTER_BUFFERS:
10691 case IORING_UNREGISTER_BUFFERS:
f4f7d21c 10692 case IORING_REGISTER_FILES:
071698e1
JA
10693 case IORING_UNREGISTER_FILES:
10694 case IORING_REGISTER_FILES_UPDATE:
10695 case IORING_REGISTER_PROBE:
10696 case IORING_REGISTER_PERSONALITY:
10697 case IORING_UNREGISTER_PERSONALITY:
992da01a
PB
10698 case IORING_REGISTER_FILES2:
10699 case IORING_REGISTER_FILES_UPDATE2:
10700 case IORING_REGISTER_BUFFERS2:
10701 case IORING_REGISTER_BUFFERS_UPDATE:
fe76421d
JA
10702 case IORING_REGISTER_IOWQ_AFF:
10703 case IORING_UNREGISTER_IOWQ_AFF:
2e480058 10704 case IORING_REGISTER_IOWQ_MAX_WORKERS:
071698e1
JA
10705 return false;
10706 default:
10707 return true;
10708 }
10709}
10710
c072481d 10711static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
e73c5c7c
PB
10712{
10713 long ret;
10714
10715 percpu_ref_kill(&ctx->refs);
10716
10717 /*
10718 * Drop uring mutex before waiting for references to exit. If another
10719 * thread is currently inside io_uring_enter() it might need to grab the
10720 * uring_lock to make progress. If we hold it here across the drain
10721 * wait, then we can deadlock. It's safe to drop the mutex here, since
10722 * no new references will come in after we've killed the percpu ref.
10723 */
10724 mutex_unlock(&ctx->uring_lock);
10725 do {
37f0e767
PB
10726 ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
10727 if (ret) {
10728 ret = min(0L, ret);
e73c5c7c 10729 break;
37f0e767
PB
10730 }
10731
e73c5c7c 10732 ret = io_run_task_work_sig();
37f0e767 10733 io_req_caches_free(ctx);
e73c5c7c
PB
10734 } while (ret >= 0);
10735 mutex_lock(&ctx->uring_lock);
10736
10737 if (ret)
10738 io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10739 return ret;
10740}
10741
edafccee
JA
10742static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10743 void __user *arg, unsigned nr_args)
b19062a5
JA
10744 __releases(ctx->uring_lock)
10745 __acquires(ctx->uring_lock)
edafccee
JA
10746{
10747 int ret;
10748
35fa71a0
JA
10749 /*
10750 * We're inside the ring mutex, if the ref is already dying, then
10751 * someone else killed the ctx or is already going through
10752 * io_uring_register().
10753 */
10754 if (percpu_ref_is_dying(&ctx->refs))
10755 return -ENXIO;
10756
75c4021a
PB
10757 if (ctx->restricted) {
10758 if (opcode >= IORING_REGISTER_LAST)
10759 return -EINVAL;
10760 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10761 if (!test_bit(opcode, ctx->restrictions.register_op))
10762 return -EACCES;
10763 }
10764
071698e1 10765 if (io_register_op_must_quiesce(opcode)) {
e73c5c7c
PB
10766 ret = io_ctx_quiesce(ctx);
10767 if (ret)
f70865db 10768 return ret;
05f3fb3c 10769 }
edafccee
JA
10770
10771 switch (opcode) {
10772 case IORING_REGISTER_BUFFERS:
634d00df 10773 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
edafccee
JA
10774 break;
10775 case IORING_UNREGISTER_BUFFERS:
10776 ret = -EINVAL;
10777 if (arg || nr_args)
10778 break;
0a96bbe4 10779 ret = io_sqe_buffers_unregister(ctx);
edafccee 10780 break;
6b06314c 10781 case IORING_REGISTER_FILES:
792e3582 10782 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
6b06314c
JA
10783 break;
10784 case IORING_UNREGISTER_FILES:
10785 ret = -EINVAL;
10786 if (arg || nr_args)
10787 break;
10788 ret = io_sqe_files_unregister(ctx);
10789 break;
c3a31e60 10790 case IORING_REGISTER_FILES_UPDATE:
c3bdad02 10791 ret = io_register_files_update(ctx, arg, nr_args);
c3a31e60 10792 break;
9b402849 10793 case IORING_REGISTER_EVENTFD:
f2842ab5 10794 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
10795 ret = -EINVAL;
10796 if (nr_args != 1)
10797 break;
10798 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
10799 if (ret)
10800 break;
10801 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10802 ctx->eventfd_async = 1;
10803 else
10804 ctx->eventfd_async = 0;
9b402849
JA
10805 break;
10806 case IORING_UNREGISTER_EVENTFD:
10807 ret = -EINVAL;
10808 if (arg || nr_args)
10809 break;
10810 ret = io_eventfd_unregister(ctx);
10811 break;
66f4af93
JA
10812 case IORING_REGISTER_PROBE:
10813 ret = -EINVAL;
10814 if (!arg || nr_args > 256)
10815 break;
10816 ret = io_probe(ctx, arg, nr_args);
10817 break;
071698e1
JA
10818 case IORING_REGISTER_PERSONALITY:
10819 ret = -EINVAL;
10820 if (arg || nr_args)
10821 break;
10822 ret = io_register_personality(ctx);
10823 break;
10824 case IORING_UNREGISTER_PERSONALITY:
10825 ret = -EINVAL;
10826 if (arg)
10827 break;
10828 ret = io_unregister_personality(ctx, nr_args);
10829 break;
7e84e1c7
SG
10830 case IORING_REGISTER_ENABLE_RINGS:
10831 ret = -EINVAL;
10832 if (arg || nr_args)
10833 break;
10834 ret = io_register_enable_rings(ctx);
10835 break;
21b55dbc
SG
10836 case IORING_REGISTER_RESTRICTIONS:
10837 ret = io_register_restrictions(ctx, arg, nr_args);
10838 break;
992da01a
PB
10839 case IORING_REGISTER_FILES2:
10840 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10841 break;
10842 case IORING_REGISTER_FILES_UPDATE2:
10843 ret = io_register_rsrc_update(ctx, arg, nr_args,
10844 IORING_RSRC_FILE);
10845 break;
10846 case IORING_REGISTER_BUFFERS2:
10847 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
792e3582 10848 break;
992da01a
PB
10849 case IORING_REGISTER_BUFFERS_UPDATE:
10850 ret = io_register_rsrc_update(ctx, arg, nr_args,
10851 IORING_RSRC_BUFFER);
c3bdad02 10852 break;
fe76421d
JA
10853 case IORING_REGISTER_IOWQ_AFF:
10854 ret = -EINVAL;
10855 if (!arg || !nr_args)
10856 break;
10857 ret = io_register_iowq_aff(ctx, arg, nr_args);
10858 break;
10859 case IORING_UNREGISTER_IOWQ_AFF:
10860 ret = -EINVAL;
10861 if (arg || nr_args)
10862 break;
10863 ret = io_unregister_iowq_aff(ctx);
10864 break;
2e480058
JA
10865 case IORING_REGISTER_IOWQ_MAX_WORKERS:
10866 ret = -EINVAL;
10867 if (!arg || nr_args != 2)
10868 break;
10869 ret = io_register_iowq_max_workers(ctx, arg);
10870 break;
edafccee
JA
10871 default:
10872 ret = -EINVAL;
10873 break;
10874 }
10875
071698e1 10876 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 10877 /* bring the ctx back to life */
05f3fb3c 10878 percpu_ref_reinit(&ctx->refs);
0f158b4c 10879 reinit_completion(&ctx->ref_comp);
05f3fb3c 10880 }
edafccee
JA
10881 return ret;
10882}
10883
10884SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10885 void __user *, arg, unsigned int, nr_args)
10886{
10887 struct io_ring_ctx *ctx;
10888 long ret = -EBADF;
10889 struct fd f;
10890
10891 f = fdget(fd);
10892 if (!f.file)
10893 return -EBADF;
10894
10895 ret = -EOPNOTSUPP;
10896 if (f.file->f_op != &io_uring_fops)
10897 goto out_fput;
10898
10899 ctx = f.file->private_data;
10900
b6c23dd5
PB
10901 io_run_task_work();
10902
edafccee
JA
10903 mutex_lock(&ctx->uring_lock);
10904 ret = __io_uring_register(ctx, opcode, arg, nr_args);
10905 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
10906 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10907 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
10908out_fput:
10909 fdput(f);
10910 return ret;
10911}
10912
2b188cc1
JA
10913static int __init io_uring_init(void)
10914{
d7f62e82
SM
10915#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10916 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10917 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10918} while (0)
10919
10920#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10921 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10922 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10923 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
10924 BUILD_BUG_SQE_ELEM(1, __u8, flags);
10925 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
10926 BUILD_BUG_SQE_ELEM(4, __s32, fd);
10927 BUILD_BUG_SQE_ELEM(8, __u64, off);
10928 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
10929 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 10930 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
10931 BUILD_BUG_SQE_ELEM(24, __u32, len);
10932 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
10933 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
10934 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10935 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
10936 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
10937 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
10938 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
10939 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
10940 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
10941 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
10942 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
10943 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
10944 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
10945 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 10946 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
10947 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
10948 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
16340eab 10949 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
d7f62e82 10950 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 10951 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
b9445598 10952 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
d7f62e82 10953
b0d658ec
PB
10954 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
10955 sizeof(struct io_uring_rsrc_update));
10956 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
10957 sizeof(struct io_uring_rsrc_update2));
90499ad0
PB
10958
10959 /* ->buf_index is u16 */
10960 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
10961
b0d658ec
PB
10962 /* should fit into one byte */
10963 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
68fe256a
PB
10964 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
10965 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
b0d658ec 10966
d3656344 10967 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
32c2d33e 10968 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
16340eab 10969
91f245d5
JA
10970 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10971 SLAB_ACCOUNT);
2b188cc1
JA
10972 return 0;
10973};
10974__initcall(io_uring_init);