io_uring: consistent typing for issue_flags
[linux-block.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
d068b506 14 * through a control-dependency in io_get_cqe (smp_store_release to
1e84b97b
SB
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
52de1fe1 47#include <net/compat.h>
2b188cc1
JA
48#include <linux/refcount.h>
49#include <linux/uio.h>
6b47ee6e 50#include <linux/bits.h>
2b188cc1
JA
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
2b188cc1
JA
58#include <linux/percpu.h>
59#include <linux/slab.h>
2b188cc1 60#include <linux/blkdev.h>
edafccee 61#include <linux/bvec.h>
2b188cc1
JA
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
6b06314c 65#include <net/scm.h>
2b188cc1
JA
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
edafccee
JA
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
aa4c3967 72#include <linux/highmem.h>
15b71abe
JA
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
4840e418 75#include <linux/fadvise.h>
3e4827b0 76#include <linux/eventpoll.h>
7d67af2c 77#include <linux/splice.h>
b41e9852 78#include <linux/task_work.h>
bcf5a063 79#include <linux/pagemap.h>
0f212204 80#include <linux/io_uring.h>
ef98eb04 81#include <linux/tracehook.h>
2b188cc1 82
c826bd7a
DD
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
2b188cc1
JA
86#include <uapi/linux/io_uring.h>
87
88#include "internal.h"
561fb04a 89#include "io-wq.h"
2b188cc1 90
5277deaa 91#define IORING_MAX_ENTRIES 32768
33a107f0 92#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
4ce8ad95 93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
65e19f54 94
187f08c1 95/* only define max */
042b0d85 96#define IORING_MAX_FIXED_FILES (1U << 15)
21b55dbc
SG
97#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
98 IORING_REGISTER_LAST + IORING_OP_LAST)
2b188cc1 99
187f08c1 100#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
2d091d62
PB
101#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
102#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
103
489809e2
PB
104#define IORING_MAX_REG_BUFFERS (1U << 14)
105
68fe256a
PB
106#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
107 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
108
109#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
110
c854357b 111#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
d886e185
PB
112 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
113 REQ_F_ASYNC_DATA)
b16fed66 114
09899b19
PB
115#define IO_TCTX_REFS_CACHE_NR (1U << 10)
116
2b188cc1
JA
117struct io_uring {
118 u32 head ____cacheline_aligned_in_smp;
119 u32 tail ____cacheline_aligned_in_smp;
120};
121
1e84b97b 122/*
75b28aff
HV
123 * This data is shared with the application through the mmap at offsets
124 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
125 *
126 * The offsets to the member fields are published through struct
127 * io_sqring_offsets when calling io_uring_setup.
128 */
75b28aff 129struct io_rings {
1e84b97b
SB
130 /*
131 * Head and tail offsets into the ring; the offsets need to be
132 * masked to get valid indices.
133 *
75b28aff
HV
134 * The kernel controls head of the sq ring and the tail of the cq ring,
135 * and the application controls tail of the sq ring and the head of the
136 * cq ring.
1e84b97b 137 */
75b28aff 138 struct io_uring sq, cq;
1e84b97b 139 /*
75b28aff 140 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
141 * ring_entries - 1)
142 */
75b28aff
HV
143 u32 sq_ring_mask, cq_ring_mask;
144 /* Ring sizes (constant, power of 2) */
145 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
146 /*
147 * Number of invalid entries dropped by the kernel due to
148 * invalid index stored in array
149 *
150 * Written by the kernel, shouldn't be modified by the
151 * application (i.e. get number of "new events" by comparing to
152 * cached value).
153 *
154 * After a new SQ head value was read by the application this
155 * counter includes all submissions that were dropped reaching
156 * the new SQ head (and possibly more).
157 */
75b28aff 158 u32 sq_dropped;
1e84b97b 159 /*
0d9b5b3a 160 * Runtime SQ flags
1e84b97b
SB
161 *
162 * Written by the kernel, shouldn't be modified by the
163 * application.
164 *
165 * The application needs a full memory barrier before checking
166 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
167 */
75b28aff 168 u32 sq_flags;
0d9b5b3a
SG
169 /*
170 * Runtime CQ flags
171 *
172 * Written by the application, shouldn't be modified by the
173 * kernel.
174 */
fe7e3257 175 u32 cq_flags;
1e84b97b
SB
176 /*
177 * Number of completion events lost because the queue was full;
178 * this should be avoided by the application by making sure
0b4295b5 179 * there are not more requests pending than there is space in
1e84b97b
SB
180 * the completion queue.
181 *
182 * Written by the kernel, shouldn't be modified by the
183 * application (i.e. get number of "new events" by comparing to
184 * cached value).
185 *
186 * As completion events come in out of order this counter is not
187 * ordered with any other data.
188 */
75b28aff 189 u32 cq_overflow;
1e84b97b
SB
190 /*
191 * Ring buffer of completion events.
192 *
193 * The kernel writes completion events fresh every time they are
194 * produced, so the application is allowed to modify pending
195 * entries.
196 */
75b28aff 197 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
198};
199
45d189c6
PB
200enum io_uring_cmd_flags {
201 IO_URING_F_NONBLOCK = 1,
889fca73 202 IO_URING_F_COMPLETE_DEFER = 2,
45d189c6
PB
203};
204
edafccee
JA
205struct io_mapped_ubuf {
206 u64 ubuf;
4751f53d 207 u64 ubuf_end;
edafccee 208 unsigned int nr_bvecs;
de293938 209 unsigned long acct_pages;
41edf1a5 210 struct bio_vec bvec[];
edafccee
JA
211};
212
50238531
BM
213struct io_ring_ctx;
214
6c2450ae
PB
215struct io_overflow_cqe {
216 struct io_uring_cqe cqe;
217 struct list_head list;
218};
219
a04b0ac0
PB
220struct io_fixed_file {
221 /* file * with additional FFS_* flags */
222 unsigned long file_ptr;
223};
224
269bbe5f
BM
225struct io_rsrc_put {
226 struct list_head list;
b60c8dce 227 u64 tag;
50238531
BM
228 union {
229 void *rsrc;
230 struct file *file;
bd54b6fe 231 struct io_mapped_ubuf *buf;
50238531 232 };
269bbe5f
BM
233};
234
aeca241b 235struct io_file_table {
042b0d85 236 struct io_fixed_file *files;
31b51510
JA
237};
238
b895c9a6 239struct io_rsrc_node {
05589553
XW
240 struct percpu_ref refs;
241 struct list_head node;
269bbe5f 242 struct list_head rsrc_list;
b895c9a6 243 struct io_rsrc_data *rsrc_data;
4a38aed2 244 struct llist_node llist;
e297822b 245 bool done;
05589553
XW
246};
247
40ae0ff7
PB
248typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
249
b895c9a6 250struct io_rsrc_data {
05f3fb3c
JA
251 struct io_ring_ctx *ctx;
252
2d091d62
PB
253 u64 **tags;
254 unsigned int nr;
40ae0ff7 255 rsrc_put_fn *do_put;
3e942498 256 atomic_t refs;
05f3fb3c 257 struct completion done;
8bad28d8 258 bool quiesce;
05f3fb3c
JA
259};
260
5a2e745d
JA
261struct io_buffer {
262 struct list_head list;
263 __u64 addr;
d1f82808 264 __u32 len;
5a2e745d
JA
265 __u16 bid;
266};
267
21b55dbc
SG
268struct io_restriction {
269 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
270 DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
271 u8 sqe_flags_allowed;
272 u8 sqe_flags_required;
7e84e1c7 273 bool registered;
21b55dbc
SG
274};
275
37d1e2e3
JA
276enum {
277 IO_SQ_THREAD_SHOULD_STOP = 0,
278 IO_SQ_THREAD_SHOULD_PARK,
279};
280
534ca6d6
JA
281struct io_sq_data {
282 refcount_t refs;
9e138a48 283 atomic_t park_pending;
09a6f4ef 284 struct mutex lock;
69fb2131
JA
285
286 /* ctx's that are using this sqd */
287 struct list_head ctx_list;
69fb2131 288
534ca6d6
JA
289 struct task_struct *thread;
290 struct wait_queue_head wait;
08369246
XW
291
292 unsigned sq_thread_idle;
37d1e2e3
JA
293 int sq_cpu;
294 pid_t task_pid;
5c2469e0 295 pid_t task_tgid;
37d1e2e3
JA
296
297 unsigned long state;
37d1e2e3 298 struct completion exited;
534ca6d6
JA
299};
300
6dd0be1e 301#define IO_COMPL_BATCH 32
6ff119a6 302#define IO_REQ_CACHE_SIZE 32
bf019da7 303#define IO_REQ_ALLOC_BATCH 8
258b29a9 304
a1ab7b35
PB
305struct io_submit_link {
306 struct io_kiocb *head;
307 struct io_kiocb *last;
308};
309
258b29a9 310struct io_submit_state {
5a158c6b
PB
311 /* inline/task_work completion list, under ->uring_lock */
312 struct io_wq_work_node free_list;
313 /* batch completion logic */
314 struct io_wq_work_list compl_reqs;
a1ab7b35 315 struct io_submit_link link;
258b29a9 316
258b29a9 317 bool plug_started;
4b628aeb 318 bool need_plug;
5a158c6b 319 struct blk_plug plug;
258b29a9
PB
320};
321
2b188cc1 322struct io_ring_ctx {
b52ecf8c 323 /* const or read-mostly hot data */
2b188cc1
JA
324 struct {
325 struct percpu_ref refs;
2b188cc1 326
b52ecf8c 327 struct io_rings *rings;
2b188cc1 328 unsigned int flags;
e1d85334 329 unsigned int compat: 1;
e1d85334
RD
330 unsigned int drain_next: 1;
331 unsigned int eventfd_async: 1;
21b55dbc 332 unsigned int restricted: 1;
f18ee4cf 333 unsigned int off_timeout_used: 1;
10c66904 334 unsigned int drain_active: 1;
b52ecf8c 335 } ____cacheline_aligned_in_smp;
2b188cc1 336
7f1129d2 337 /* submission data */
b52ecf8c 338 struct {
0499e582
PB
339 struct mutex uring_lock;
340
75b28aff
HV
341 /*
342 * Ring buffer of indices into array of io_uring_sqe, which is
343 * mmapped by the application using the IORING_OFF_SQES offset.
344 *
345 * This indirection could e.g. be used to assign fixed
346 * io_uring_sqe entries to operations and only submit them to
347 * the queue when needed.
348 *
349 * The kernel modifies neither the indices array nor the entries
350 * array.
351 */
352 u32 *sq_array;
c7af47cf 353 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
354 unsigned cached_sq_head;
355 unsigned sq_entries;
de0617e4 356 struct list_head defer_list;
7f1129d2
PB
357
358 /*
359 * Fixed resources fast path, should be accessed only under
360 * uring_lock, and updated through io_uring_register(2)
361 */
362 struct io_rsrc_node *rsrc_node;
ab409402 363 int rsrc_cached_refs;
7f1129d2
PB
364 struct io_file_table file_table;
365 unsigned nr_user_files;
366 unsigned nr_user_bufs;
367 struct io_mapped_ubuf **user_bufs;
368
369 struct io_submit_state submit_state;
5262f567 370 struct list_head timeout_list;
ef9dd637 371 struct list_head ltimeout_list;
1d7bb1d5 372 struct list_head cq_overflow_list;
7f1129d2
PB
373 struct xarray io_buffers;
374 struct xarray personalities;
375 u32 pers_next;
376 unsigned sq_thread_idle;
2b188cc1
JA
377 } ____cacheline_aligned_in_smp;
378
d0acdee2 379 /* IRQ completion list, under ->completion_lock */
c2b6c6bc 380 struct io_wq_work_list locked_free_list;
d0acdee2 381 unsigned int locked_free_nr;
3c1a2ead 382
7c30f36a 383 const struct cred *sq_creds; /* cred used for __io_sq_thread() */
534ca6d6
JA
384 struct io_sq_data *sq_data; /* if using sq thread polling */
385
90554200 386 struct wait_queue_head sqo_sq_wait;
69fb2131 387 struct list_head sqd_list;
75b28aff 388
5ed7a37d
PB
389 unsigned long check_cq_overflow;
390
206aefde
JA
391 struct {
392 unsigned cached_cq_tail;
393 unsigned cq_entries;
0499e582 394 struct eventfd_ctx *cq_ev_fd;
0499e582
PB
395 struct wait_queue_head cq_wait;
396 unsigned cq_extra;
397 atomic_t cq_timeouts;
0499e582 398 unsigned cq_last_tm_flush;
206aefde 399 } ____cacheline_aligned_in_smp;
2b188cc1 400
2b188cc1
JA
401 struct {
402 spinlock_t completion_lock;
e94f141b 403
89850fce
JA
404 spinlock_t timeout_lock;
405
def596e9 406 /*
540e32a0 407 * ->iopoll_list is protected by the ctx->uring_lock for
def596e9
JA
408 * io_uring instances that don't use IORING_SETUP_SQPOLL.
409 * For SQPOLL, only the single threaded io_sq_thread() will
410 * manipulate the list, hence no extra locking is needed there.
411 */
5eef4e87 412 struct io_wq_work_list iopoll_list;
78076bb6
JA
413 struct hlist_head *cancel_hash;
414 unsigned cancel_hash_bits;
915b3dde 415 bool poll_multi_queue;
2b188cc1 416 } ____cacheline_aligned_in_smp;
85faa7b8 417
21b55dbc 418 struct io_restriction restrictions;
3c1a2ead 419
b13a8918
PB
420 /* slow path rsrc auxilary data, used by update/register */
421 struct {
422 struct io_rsrc_node *rsrc_backup_node;
423 struct io_mapped_ubuf *dummy_ubuf;
424 struct io_rsrc_data *file_data;
425 struct io_rsrc_data *buf_data;
426
427 struct delayed_work rsrc_put_work;
428 struct llist_head rsrc_put_llist;
429 struct list_head rsrc_ref_list;
430 spinlock_t rsrc_ref_lock;
431 };
432
3c1a2ead 433 /* Keep this last, we don't need it for the fast path */
b986af7e
PB
434 struct {
435 #if defined(CONFIG_UNIX)
436 struct socket *ring_sock;
437 #endif
438 /* hashed buffered write serialization */
439 struct io_wq_hash *hash_map;
440
441 /* Only used for accounting purposes */
442 struct user_struct *user;
443 struct mm_struct *mm_account;
444
445 /* ctx exit and cancelation */
9011bf9a
PB
446 struct llist_head fallback_llist;
447 struct delayed_work fallback_work;
b986af7e
PB
448 struct work_struct exit_work;
449 struct list_head tctx_list;
450 struct completion ref_comp;
451 };
2b188cc1
JA
452};
453
53e043b2
SM
454struct io_uring_task {
455 /* submission side */
09899b19 456 int cached_refs;
53e043b2
SM
457 struct xarray xa;
458 struct wait_queue_head wait;
ee53fb2b
SM
459 const struct io_ring_ctx *last;
460 struct io_wq *io_wq;
53e043b2 461 struct percpu_counter inflight;
b303fe2e 462 atomic_t inflight_tracked;
53e043b2 463 atomic_t in_idle;
53e043b2
SM
464
465 spinlock_t task_lock;
466 struct io_wq_work_list task_list;
53e043b2 467 struct callback_head task_work;
6294f368 468 bool task_running;
53e043b2
SM
469};
470
09bb8394
JA
471/*
472 * First field must be the file pointer in all the
473 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
474 */
221c5eb2
JA
475struct io_poll_iocb {
476 struct file *file;
018043be 477 struct wait_queue_head *head;
221c5eb2 478 __poll_t events;
8c838788 479 bool done;
221c5eb2 480 bool canceled;
392edb45 481 struct wait_queue_entry wait;
221c5eb2
JA
482};
483
9d805892 484struct io_poll_update {
018043be 485 struct file *file;
9d805892
PB
486 u64 old_user_data;
487 u64 new_user_data;
488 __poll_t events;
b69de288
JA
489 bool update_events;
490 bool update_user_data;
018043be
PB
491};
492
b5dba59e
JA
493struct io_close {
494 struct file *file;
b5dba59e 495 int fd;
7df778be 496 u32 file_slot;
b5dba59e
JA
497};
498
ad8a48ac
JA
499struct io_timeout_data {
500 struct io_kiocb *req;
501 struct hrtimer timer;
502 struct timespec64 ts;
503 enum hrtimer_mode mode;
50c1df2b 504 u32 flags;
ad8a48ac
JA
505};
506
8ed8d3c3
JA
507struct io_accept {
508 struct file *file;
509 struct sockaddr __user *addr;
510 int __user *addr_len;
511 int flags;
aaa4db12 512 u32 file_slot;
09952e3e 513 unsigned long nofile;
8ed8d3c3
JA
514};
515
516struct io_sync {
517 struct file *file;
518 loff_t len;
519 loff_t off;
520 int flags;
d63d1b5e 521 int mode;
8ed8d3c3
JA
522};
523
fbf23849
JA
524struct io_cancel {
525 struct file *file;
526 u64 addr;
527};
528
b29472ee
JA
529struct io_timeout {
530 struct file *file;
bfe68a22
PB
531 u32 off;
532 u32 target_seq;
135fcde8 533 struct list_head list;
90cd7e42
PB
534 /* head of the link, used by linked timeouts only */
535 struct io_kiocb *head;
89b263f6
JA
536 /* for linked completions */
537 struct io_kiocb *prev;
b29472ee
JA
538};
539
0bdf7a2d
PB
540struct io_timeout_rem {
541 struct file *file;
542 u64 addr;
9c8e11b3
PB
543
544 /* timeout update */
545 struct timespec64 ts;
546 u32 flags;
f1042b6c 547 bool ltimeout;
0bdf7a2d
PB
548};
549
9adbd45d
JA
550struct io_rw {
551 /* NOTE: kiocb has the file as the first member, so don't do it here */
552 struct kiocb kiocb;
553 u64 addr;
554 u64 len;
555};
556
3fbb51c1
JA
557struct io_connect {
558 struct file *file;
559 struct sockaddr __user *addr;
560 int addr_len;
561};
562
e47293fd
JA
563struct io_sr_msg {
564 struct file *file;
fddaface 565 union {
4af3417a
PB
566 struct compat_msghdr __user *umsg_compat;
567 struct user_msghdr __user *umsg;
568 void __user *buf;
fddaface 569 };
e47293fd 570 int msg_flags;
bcda7baa 571 int bgid;
fddaface 572 size_t len;
e47293fd
JA
573};
574
15b71abe
JA
575struct io_open {
576 struct file *file;
577 int dfd;
b9445598 578 u32 file_slot;
15b71abe 579 struct filename *filename;
c12cedf2 580 struct open_how how;
4022e7af 581 unsigned long nofile;
15b71abe
JA
582};
583
269bbe5f 584struct io_rsrc_update {
05f3fb3c
JA
585 struct file *file;
586 u64 arg;
587 u32 nr_args;
588 u32 offset;
589};
590
4840e418
JA
591struct io_fadvise {
592 struct file *file;
593 u64 offset;
594 u32 len;
595 u32 advice;
596};
597
c1ca757b
JA
598struct io_madvise {
599 struct file *file;
600 u64 addr;
601 u32 len;
602 u32 advice;
603};
604
3e4827b0
JA
605struct io_epoll {
606 struct file *file;
607 int epfd;
608 int op;
609 int fd;
610 struct epoll_event event;
e47293fd
JA
611};
612
7d67af2c
PB
613struct io_splice {
614 struct file *file_out;
615 struct file *file_in;
616 loff_t off_out;
617 loff_t off_in;
618 u64 len;
619 unsigned int flags;
620};
621
ddf0322d
JA
622struct io_provide_buf {
623 struct file *file;
624 __u64 addr;
38134ada 625 __u32 len;
ddf0322d
JA
626 __u32 bgid;
627 __u16 nbufs;
628 __u16 bid;
629};
630
1d9e1288
BM
631struct io_statx {
632 struct file *file;
633 int dfd;
634 unsigned int mask;
635 unsigned int flags;
e62753e4 636 const char __user *filename;
1d9e1288
BM
637 struct statx __user *buffer;
638};
639
36f4fa68
JA
640struct io_shutdown {
641 struct file *file;
642 int how;
643};
644
80a261fd
JA
645struct io_rename {
646 struct file *file;
647 int old_dfd;
648 int new_dfd;
649 struct filename *oldpath;
650 struct filename *newpath;
651 int flags;
652};
653
14a1143b
JA
654struct io_unlink {
655 struct file *file;
656 int dfd;
657 int flags;
658 struct filename *filename;
659};
660
e34a02dc
DK
661struct io_mkdir {
662 struct file *file;
663 int dfd;
664 umode_t mode;
665 struct filename *filename;
666};
667
7a8721f8
DK
668struct io_symlink {
669 struct file *file;
670 int new_dfd;
671 struct filename *oldpath;
672 struct filename *newpath;
673};
674
cf30da90
DK
675struct io_hardlink {
676 struct file *file;
677 int old_dfd;
678 int new_dfd;
679 struct filename *oldpath;
680 struct filename *newpath;
681 int flags;
682};
683
f499a021
JA
684struct io_async_connect {
685 struct sockaddr_storage address;
686};
687
03b1230c
JA
688struct io_async_msghdr {
689 struct iovec fast_iov[UIO_FASTIOV];
257e84a5
PB
690 /* points to an allocated iov, if NULL we use fast_iov instead */
691 struct iovec *free_iov;
03b1230c
JA
692 struct sockaddr __user *uaddr;
693 struct msghdr msg;
b537916c 694 struct sockaddr_storage addr;
03b1230c
JA
695};
696
f67676d1
JA
697struct io_async_rw {
698 struct iovec fast_iov[UIO_FASTIOV];
ff6165b2
JA
699 const struct iovec *free_iovec;
700 struct iov_iter iter;
cd658695 701 struct iov_iter_state iter_state;
227c0c96 702 size_t bytes_done;
bcf5a063 703 struct wait_page_queue wpq;
f67676d1
JA
704};
705
6b47ee6e
PB
706enum {
707 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
708 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
709 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
710 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
711 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 712 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
6b47ee6e 713
dddca226 714 /* first byte is taken by user flags, shift it to not overlap */
93d2bcd2 715 REQ_F_FAIL_BIT = 8,
6b47ee6e
PB
716 REQ_F_INFLIGHT_BIT,
717 REQ_F_CUR_POS_BIT,
718 REQ_F_NOWAIT_BIT,
6b47ee6e 719 REQ_F_LINK_TIMEOUT_BIT,
99bc4c38 720 REQ_F_NEED_CLEANUP_BIT,
d7718a9d 721 REQ_F_POLLED_BIT,
bcda7baa 722 REQ_F_BUFFER_SELECTED_BIT,
e342c807 723 REQ_F_COMPLETE_INLINE_BIT,
230d50d4 724 REQ_F_REISSUE_BIT,
b8e64b53 725 REQ_F_CREDS_BIT,
20e60a38 726 REQ_F_REFCOUNT_BIT,
4d13d1a4 727 REQ_F_ARM_LTIMEOUT_BIT,
d886e185 728 REQ_F_ASYNC_DATA_BIT,
7b29f92d 729 /* keep async read/write and isreg together and in order */
b191e2df
PB
730 REQ_F_NOWAIT_READ_BIT,
731 REQ_F_NOWAIT_WRITE_BIT,
7b29f92d 732 REQ_F_ISREG_BIT,
84557871
JA
733
734 /* not a real bit, just to check we're not overflowing the space */
735 __REQ_F_LAST_BIT,
6b47ee6e
PB
736};
737
738enum {
739 /* ctx owns file */
740 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
741 /* drain existing IO first */
742 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
743 /* linked sqes */
744 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
745 /* doesn't sever on completion < 0 */
746 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
747 /* IOSQE_ASYNC */
748 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
749 /* IOSQE_BUFFER_SELECT */
750 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
6b47ee6e 751
6b47ee6e 752 /* fail rest of links */
93d2bcd2 753 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
b05a1bcd 754 /* on inflight list, should be cancelled and waited on exit reliably */
6b47ee6e
PB
755 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
756 /* read/write uses file position */
757 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
758 /* must not punt to workers */
759 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
900fad45 760 /* has or had linked timeout */
6b47ee6e 761 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
99bc4c38
PB
762 /* needs cleanup */
763 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
d7718a9d
JA
764 /* already went through poll handler */
765 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
766 /* buffer already selected */
767 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
e342c807
PB
768 /* completion is deferred through io_comp_state */
769 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
230d50d4
JA
770 /* caller should reissue async */
771 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
7b29f92d 772 /* supports async reads */
b191e2df 773 REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
7b29f92d 774 /* supports async writes */
b191e2df 775 REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
7b29f92d
JA
776 /* regular file */
777 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
b8e64b53
PB
778 /* has creds assigned */
779 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
20e60a38
PB
780 /* skip refcounting if not set */
781 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
4d13d1a4
PB
782 /* there is a linked timeout that has to be armed */
783 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
d886e185
PB
784 /* ->async_data allocated */
785 REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
d7718a9d
JA
786};
787
788struct async_poll {
789 struct io_poll_iocb poll;
807abcb0 790 struct io_poll_iocb *double_poll;
6b47ee6e
PB
791};
792
f237c30a 793typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
5b0a6acc 794
7cbf1722 795struct io_task_work {
5b0a6acc
PB
796 union {
797 struct io_wq_work_node node;
798 struct llist_node fallback_node;
799 };
800 io_req_tw_func_t func;
7cbf1722
JA
801};
802
992da01a
PB
803enum {
804 IORING_RSRC_FILE = 0,
805 IORING_RSRC_BUFFER = 1,
806};
807
09bb8394
JA
808/*
809 * NOTE! Each of the iocb union members has the file pointer
810 * as the first entry in their struct definition. So you can
811 * access the file pointer through any of the sub-structs,
812 * or directly as just 'ki_filp' in this struct.
813 */
2b188cc1 814struct io_kiocb {
221c5eb2 815 union {
09bb8394 816 struct file *file;
9adbd45d 817 struct io_rw rw;
221c5eb2 818 struct io_poll_iocb poll;
9d805892 819 struct io_poll_update poll_update;
8ed8d3c3
JA
820 struct io_accept accept;
821 struct io_sync sync;
fbf23849 822 struct io_cancel cancel;
b29472ee 823 struct io_timeout timeout;
0bdf7a2d 824 struct io_timeout_rem timeout_rem;
3fbb51c1 825 struct io_connect connect;
e47293fd 826 struct io_sr_msg sr_msg;
15b71abe 827 struct io_open open;
b5dba59e 828 struct io_close close;
269bbe5f 829 struct io_rsrc_update rsrc_update;
4840e418 830 struct io_fadvise fadvise;
c1ca757b 831 struct io_madvise madvise;
3e4827b0 832 struct io_epoll epoll;
7d67af2c 833 struct io_splice splice;
ddf0322d 834 struct io_provide_buf pbuf;
1d9e1288 835 struct io_statx statx;
36f4fa68 836 struct io_shutdown shutdown;
80a261fd 837 struct io_rename rename;
14a1143b 838 struct io_unlink unlink;
e34a02dc 839 struct io_mkdir mkdir;
7a8721f8 840 struct io_symlink symlink;
cf30da90 841 struct io_hardlink hardlink;
221c5eb2 842 };
2b188cc1 843
d625c6ee 844 u8 opcode;
65a6543d
XW
845 /* polled IO has completed */
846 u8 iopoll_completed;
4f4eeba8 847 u16 buf_index;
d17e56eb
PB
848 unsigned int flags;
849
850 u64 user_data;
9cf7c104 851 u32 result;
d17e56eb 852 u32 cflags;
4f4eeba8 853
010e8e6b 854 struct io_ring_ctx *ctx;
010e8e6b 855 struct task_struct *task;
d7718a9d 856
269bbe5f 857 struct percpu_ref *fixed_rsrc_refs;
d886e185
PB
858 /* store used ubuf, so we can prevent reloading */
859 struct io_mapped_ubuf *imu;
fcb323cc 860
7e3709d5 861 /* used by request caches, completion batching and iopoll */
ef05d9eb 862 struct io_wq_work_node comp_list;
d17e56eb 863 atomic_t refs;
7e3709d5 864 struct io_kiocb *link;
5b0a6acc 865 struct io_task_work io_task_work;
010e8e6b
PB
866 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
867 struct hlist_node hash_node;
7e3709d5 868 /* internal polling, see IORING_FEAT_FAST_POLL */
010e8e6b 869 struct async_poll *apoll;
d886e185
PB
870 /* opcode allocated if it needs to store data for async defer */
871 void *async_data;
ef05d9eb 872 struct io_wq_work work;
7e3709d5 873 /* custom credentials, valid IFF REQ_F_CREDS is set */
ef05d9eb 874 const struct cred *creds;
7e3709d5 875 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
30d51dd4 876 struct io_buffer *kbuf;
2b188cc1 877};
05589553 878
13bf43f5
PB
879struct io_tctx_node {
880 struct list_head ctx_node;
881 struct task_struct *task;
13bf43f5
PB
882 struct io_ring_ctx *ctx;
883};
884
27dc8338
PB
885struct io_defer_entry {
886 struct list_head list;
887 struct io_kiocb *req;
9cf7c104 888 u32 seq;
2b188cc1
JA
889};
890
d3656344 891struct io_op_def {
d3656344
JA
892 /* needs req->file assigned */
893 unsigned needs_file : 1;
6d63416d
PB
894 /* should block plug */
895 unsigned plug : 1;
d3656344
JA
896 /* hash wq insertion if file is a regular file */
897 unsigned hash_reg_file : 1;
898 /* unbound wq insertion if file is a non-regular file */
899 unsigned unbound_nonreg_file : 1;
8a72758c
JA
900 /* set if opcode supports polled "wait" */
901 unsigned pollin : 1;
902 unsigned pollout : 1;
bcda7baa
JA
903 /* op supports buffer selection */
904 unsigned buffer_select : 1;
26f0505a
PB
905 /* do prep async if is going to be punted */
906 unsigned needs_async_setup : 1;
6d63416d
PB
907 /* opcode is not supported by this kernel */
908 unsigned not_supported : 1;
e8c2bc1f
JA
909 /* size of async data needed, if any */
910 unsigned short async_size;
d3656344
JA
911};
912
0918682b 913static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
914 [IORING_OP_NOP] = {},
915 [IORING_OP_READV] = {
d3656344
JA
916 .needs_file = 1,
917 .unbound_nonreg_file = 1,
8a72758c 918 .pollin = 1,
4d954c25 919 .buffer_select = 1,
26f0505a 920 .needs_async_setup = 1,
27926b68 921 .plug = 1,
e8c2bc1f 922 .async_size = sizeof(struct io_async_rw),
d3656344 923 },
0463b6c5 924 [IORING_OP_WRITEV] = {
d3656344
JA
925 .needs_file = 1,
926 .hash_reg_file = 1,
927 .unbound_nonreg_file = 1,
8a72758c 928 .pollout = 1,
26f0505a 929 .needs_async_setup = 1,
27926b68 930 .plug = 1,
e8c2bc1f 931 .async_size = sizeof(struct io_async_rw),
d3656344 932 },
0463b6c5 933 [IORING_OP_FSYNC] = {
d3656344
JA
934 .needs_file = 1,
935 },
0463b6c5 936 [IORING_OP_READ_FIXED] = {
d3656344
JA
937 .needs_file = 1,
938 .unbound_nonreg_file = 1,
8a72758c 939 .pollin = 1,
27926b68 940 .plug = 1,
e8c2bc1f 941 .async_size = sizeof(struct io_async_rw),
d3656344 942 },
0463b6c5 943 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
944 .needs_file = 1,
945 .hash_reg_file = 1,
946 .unbound_nonreg_file = 1,
8a72758c 947 .pollout = 1,
27926b68 948 .plug = 1,
e8c2bc1f 949 .async_size = sizeof(struct io_async_rw),
d3656344 950 },
0463b6c5 951 [IORING_OP_POLL_ADD] = {
d3656344
JA
952 .needs_file = 1,
953 .unbound_nonreg_file = 1,
954 },
0463b6c5
PB
955 [IORING_OP_POLL_REMOVE] = {},
956 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344
JA
957 .needs_file = 1,
958 },
0463b6c5 959 [IORING_OP_SENDMSG] = {
d3656344
JA
960 .needs_file = 1,
961 .unbound_nonreg_file = 1,
8a72758c 962 .pollout = 1,
26f0505a 963 .needs_async_setup = 1,
e8c2bc1f 964 .async_size = sizeof(struct io_async_msghdr),
d3656344 965 },
0463b6c5 966 [IORING_OP_RECVMSG] = {
d3656344
JA
967 .needs_file = 1,
968 .unbound_nonreg_file = 1,
8a72758c 969 .pollin = 1,
52de1fe1 970 .buffer_select = 1,
26f0505a 971 .needs_async_setup = 1,
e8c2bc1f 972 .async_size = sizeof(struct io_async_msghdr),
d3656344 973 },
0463b6c5 974 [IORING_OP_TIMEOUT] = {
e8c2bc1f 975 .async_size = sizeof(struct io_timeout_data),
d3656344 976 },
9c8e11b3
PB
977 [IORING_OP_TIMEOUT_REMOVE] = {
978 /* used by timeout updates' prep() */
9c8e11b3 979 },
0463b6c5 980 [IORING_OP_ACCEPT] = {
d3656344
JA
981 .needs_file = 1,
982 .unbound_nonreg_file = 1,
8a72758c 983 .pollin = 1,
d3656344 984 },
0463b6c5
PB
985 [IORING_OP_ASYNC_CANCEL] = {},
986 [IORING_OP_LINK_TIMEOUT] = {
e8c2bc1f 987 .async_size = sizeof(struct io_timeout_data),
d3656344 988 },
0463b6c5 989 [IORING_OP_CONNECT] = {
d3656344
JA
990 .needs_file = 1,
991 .unbound_nonreg_file = 1,
8a72758c 992 .pollout = 1,
26f0505a 993 .needs_async_setup = 1,
e8c2bc1f 994 .async_size = sizeof(struct io_async_connect),
d3656344 995 },
0463b6c5 996 [IORING_OP_FALLOCATE] = {
d3656344 997 .needs_file = 1,
d3656344 998 },
44526bed
JA
999 [IORING_OP_OPENAT] = {},
1000 [IORING_OP_CLOSE] = {},
1001 [IORING_OP_FILES_UPDATE] = {},
1002 [IORING_OP_STATX] = {},
0463b6c5 1003 [IORING_OP_READ] = {
3a6820f2
JA
1004 .needs_file = 1,
1005 .unbound_nonreg_file = 1,
8a72758c 1006 .pollin = 1,
bcda7baa 1007 .buffer_select = 1,
27926b68 1008 .plug = 1,
e8c2bc1f 1009 .async_size = sizeof(struct io_async_rw),
3a6820f2 1010 },
0463b6c5 1011 [IORING_OP_WRITE] = {
3a6820f2 1012 .needs_file = 1,
7b3188e7 1013 .hash_reg_file = 1,
3a6820f2 1014 .unbound_nonreg_file = 1,
8a72758c 1015 .pollout = 1,
27926b68 1016 .plug = 1,
e8c2bc1f 1017 .async_size = sizeof(struct io_async_rw),
3a6820f2 1018 },
0463b6c5 1019 [IORING_OP_FADVISE] = {
4840e418 1020 .needs_file = 1,
c1ca757b 1021 },
44526bed 1022 [IORING_OP_MADVISE] = {},
0463b6c5 1023 [IORING_OP_SEND] = {
fddaface
JA
1024 .needs_file = 1,
1025 .unbound_nonreg_file = 1,
8a72758c 1026 .pollout = 1,
fddaface 1027 },
0463b6c5 1028 [IORING_OP_RECV] = {
fddaface
JA
1029 .needs_file = 1,
1030 .unbound_nonreg_file = 1,
8a72758c 1031 .pollin = 1,
bcda7baa 1032 .buffer_select = 1,
fddaface 1033 },
0463b6c5 1034 [IORING_OP_OPENAT2] = {
cebdb986 1035 },
3e4827b0
JA
1036 [IORING_OP_EPOLL_CTL] = {
1037 .unbound_nonreg_file = 1,
3e4827b0 1038 },
7d67af2c
PB
1039 [IORING_OP_SPLICE] = {
1040 .needs_file = 1,
1041 .hash_reg_file = 1,
1042 .unbound_nonreg_file = 1,
ddf0322d
JA
1043 },
1044 [IORING_OP_PROVIDE_BUFFERS] = {},
067524e9 1045 [IORING_OP_REMOVE_BUFFERS] = {},
f2a8d5c7
PB
1046 [IORING_OP_TEE] = {
1047 .needs_file = 1,
1048 .hash_reg_file = 1,
1049 .unbound_nonreg_file = 1,
1050 },
36f4fa68
JA
1051 [IORING_OP_SHUTDOWN] = {
1052 .needs_file = 1,
1053 },
44526bed
JA
1054 [IORING_OP_RENAMEAT] = {},
1055 [IORING_OP_UNLINKAT] = {},
e34a02dc 1056 [IORING_OP_MKDIRAT] = {},
7a8721f8 1057 [IORING_OP_SYMLINKAT] = {},
cf30da90 1058 [IORING_OP_LINKAT] = {},
d3656344
JA
1059};
1060
0756a869
PB
1061/* requests with any of those set should undergo io_disarm_next() */
1062#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
1063
7a612350 1064static bool io_disarm_next(struct io_kiocb *req);
eef51daa 1065static void io_uring_del_tctx_node(unsigned long index);
9936c7c2
PB
1066static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1067 struct task_struct *task,
3dd0c97a 1068 bool cancel_all);
78cc687b 1069static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1ffc5422 1070
d4d19c19 1071static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1072 s32 res, u32 cflags);
ec9c02ad 1073static void io_put_req(struct io_kiocb *req);
91c2f697 1074static void io_put_req_deferred(struct io_kiocb *req);
c7dae4ba 1075static void io_dismantle_req(struct io_kiocb *req);
94ae5e77 1076static void io_queue_linked_timeout(struct io_kiocb *req);
fdecb662 1077static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 1078 struct io_uring_rsrc_update2 *up,
98f0b3b4 1079 unsigned nr_args);
68fb8979 1080static void io_clean_op(struct io_kiocb *req);
ac177053 1081static struct file *io_file_get(struct io_ring_ctx *ctx,
8371adf5 1082 struct io_kiocb *req, int fd, bool fixed);
c5eef2b9 1083static void __io_queue_sqe(struct io_kiocb *req);
269bbe5f 1084static void io_rsrc_put_work(struct work_struct *work);
de0617e4 1085
907d1df3 1086static void io_req_task_queue(struct io_kiocb *req);
c450178d 1087static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
179ae0d1 1088static int io_req_prep_async(struct io_kiocb *req);
de0617e4 1089
b9445598
PB
1090static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1091 unsigned int issue_flags, u32 slot_index);
7df778be
PB
1092static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
1093
f1042b6c 1094static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
b9445598 1095
2b188cc1
JA
1096static struct kmem_cache *req_cachep;
1097
0918682b 1098static const struct file_operations io_uring_fops;
2b188cc1
JA
1099
1100struct sock *io_uring_get_socket(struct file *file)
1101{
1102#if defined(CONFIG_UNIX)
1103 if (file->f_op == &io_uring_fops) {
1104 struct io_ring_ctx *ctx = file->private_data;
1105
1106 return ctx->ring_sock->sk;
1107 }
1108#endif
1109 return NULL;
1110}
1111EXPORT_SYMBOL(io_uring_get_socket);
1112
f237c30a
PB
1113static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1114{
1115 if (!*locked) {
1116 mutex_lock(&ctx->uring_lock);
1117 *locked = true;
1118 }
1119}
1120
f2f87370
PB
1121#define io_for_each_link(pos, head) \
1122 for (pos = (head); pos; pos = pos->link)
1123
21c843d5
PB
1124/*
1125 * Shamelessly stolen from the mm implementation of page reference checking,
1126 * see commit f958d7b528b1 for details.
1127 */
1128#define req_ref_zero_or_close_to_overflow(req) \
1129 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
1130
1131static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1132{
20e60a38 1133 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1134 return atomic_inc_not_zero(&req->refs);
1135}
1136
21c843d5
PB
1137static inline bool req_ref_put_and_test(struct io_kiocb *req)
1138{
20e60a38
PB
1139 if (likely(!(req->flags & REQ_F_REFCOUNT)))
1140 return true;
1141
21c843d5
PB
1142 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1143 return atomic_dec_and_test(&req->refs);
1144}
1145
1146static inline void req_ref_put(struct io_kiocb *req)
1147{
20e60a38 1148 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1149 WARN_ON_ONCE(req_ref_put_and_test(req));
1150}
1151
1152static inline void req_ref_get(struct io_kiocb *req)
1153{
20e60a38 1154 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1155 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1156 atomic_inc(&req->refs);
1157}
1158
c450178d
PB
1159static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1160{
6f33b0bc 1161 if (!wq_list_empty(&ctx->submit_state.compl_reqs))
c450178d
PB
1162 __io_submit_flush_completions(ctx);
1163}
1164
48dcd38d 1165static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
20e60a38
PB
1166{
1167 if (!(req->flags & REQ_F_REFCOUNT)) {
1168 req->flags |= REQ_F_REFCOUNT;
48dcd38d 1169 atomic_set(&req->refs, nr);
20e60a38
PB
1170 }
1171}
1172
48dcd38d
PB
1173static inline void io_req_set_refcount(struct io_kiocb *req)
1174{
1175 __io_req_set_refcount(req, 1);
1176}
1177
ab409402
PB
1178#define IO_RSRC_REF_BATCH 100
1179
1180static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1181 struct io_ring_ctx *ctx)
1182 __must_hold(&ctx->uring_lock)
1183{
1184 struct percpu_ref *ref = req->fixed_rsrc_refs;
1185
1186 if (ref) {
1187 if (ref == &ctx->rsrc_node->refs)
1188 ctx->rsrc_cached_refs++;
1189 else
1190 percpu_ref_put(ref);
1191 }
1192}
1193
1194static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
1195{
1196 if (req->fixed_rsrc_refs)
1197 percpu_ref_put(req->fixed_rsrc_refs);
1198}
1199
1200static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1201 __must_hold(&ctx->uring_lock)
1202{
1203 if (ctx->rsrc_cached_refs) {
1204 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
1205 ctx->rsrc_cached_refs = 0;
1206 }
1207}
1208
1209static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1210 __must_hold(&ctx->uring_lock)
1211{
1212 ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1213 percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1214}
1215
a46be971
PB
1216static inline void io_req_set_rsrc_node(struct io_kiocb *req,
1217 struct io_ring_ctx *ctx)
36f72fe2 1218{
269bbe5f 1219 if (!req->fixed_rsrc_refs) {
a7f0ed5a 1220 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
ab409402
PB
1221 ctx->rsrc_cached_refs--;
1222 if (unlikely(ctx->rsrc_cached_refs < 0))
1223 io_rsrc_refs_refill(ctx);
36f72fe2
PB
1224 }
1225}
1226
f70865db
PB
1227static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1228{
1229 bool got = percpu_ref_tryget(ref);
1230
1231 /* already at zero, wait for ->release() */
1232 if (!got)
1233 wait_for_completion(compl);
1234 percpu_ref_resurrect(ref);
1235 if (got)
1236 percpu_ref_put(ref);
1237}
1238
3dd0c97a
PB
1239static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1240 bool cancel_all)
08d23634
PB
1241{
1242 struct io_kiocb *req;
1243
68207680 1244 if (task && head->task != task)
08d23634 1245 return false;
3dd0c97a 1246 if (cancel_all)
08d23634
PB
1247 return true;
1248
1249 io_for_each_link(req, head) {
b05a1bcd 1250 if (req->flags & REQ_F_INFLIGHT)
02a13674 1251 return true;
08d23634
PB
1252 }
1253 return false;
1254}
1255
d886e185
PB
1256static inline bool req_has_async_data(struct io_kiocb *req)
1257{
1258 return req->flags & REQ_F_ASYNC_DATA;
1259}
1260
93d2bcd2 1261static inline void req_set_fail(struct io_kiocb *req)
c40f6379 1262{
93d2bcd2 1263 req->flags |= REQ_F_FAIL;
c40f6379 1264}
4a38aed2 1265
a8295b98
HX
1266static inline void req_fail_link_node(struct io_kiocb *req, int res)
1267{
1268 req_set_fail(req);
1269 req->result = res;
1270}
1271
c072481d 1272static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
2b188cc1
JA
1273{
1274 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1275
0f158b4c 1276 complete(&ctx->ref_comp);
2b188cc1
JA
1277}
1278
8eb7e2d0
PB
1279static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1280{
1281 return !req->timeout.off;
1282}
1283
c072481d 1284static __cold void io_fallback_req_func(struct work_struct *work)
f56165e6
PB
1285{
1286 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1287 fallback_work.work);
1288 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1289 struct io_kiocb *req, *tmp;
f237c30a 1290 bool locked = false;
f56165e6
PB
1291
1292 percpu_ref_get(&ctx->refs);
1293 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
f237c30a 1294 req->io_task_work.func(req, &locked);
5636c00d 1295
f237c30a 1296 if (locked) {
c450178d 1297 io_submit_flush_completions(ctx);
f237c30a
PB
1298 mutex_unlock(&ctx->uring_lock);
1299 }
f56165e6
PB
1300 percpu_ref_put(&ctx->refs);
1301}
1302
c072481d 1303static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
2b188cc1
JA
1304{
1305 struct io_ring_ctx *ctx;
78076bb6 1306 int hash_bits;
2b188cc1
JA
1307
1308 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1309 if (!ctx)
1310 return NULL;
1311
78076bb6
JA
1312 /*
1313 * Use 5 bits less than the max cq entries, that should give us around
1314 * 32 entries per hash list if totally full and uniformly spread.
1315 */
1316 hash_bits = ilog2(p->cq_entries);
1317 hash_bits -= 5;
1318 if (hash_bits <= 0)
1319 hash_bits = 1;
1320 ctx->cancel_hash_bits = hash_bits;
1321 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1322 GFP_KERNEL);
1323 if (!ctx->cancel_hash)
1324 goto err;
1325 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1326
6224843d
PB
1327 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1328 if (!ctx->dummy_ubuf)
1329 goto err;
1330 /* set invalid range, so io_import_fixed() fails meeting it */
1331 ctx->dummy_ubuf->ubuf = -1UL;
1332
21482896 1333 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
1334 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1335 goto err;
2b188cc1
JA
1336
1337 ctx->flags = p->flags;
90554200 1338 init_waitqueue_head(&ctx->sqo_sq_wait);
69fb2131 1339 INIT_LIST_HEAD(&ctx->sqd_list);
1d7bb1d5 1340 INIT_LIST_HEAD(&ctx->cq_overflow_list);
0f158b4c 1341 init_completion(&ctx->ref_comp);
9e15c3a0 1342 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
61cf9370 1343 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
2b188cc1 1344 mutex_init(&ctx->uring_lock);
311997b3 1345 init_waitqueue_head(&ctx->cq_wait);
2b188cc1 1346 spin_lock_init(&ctx->completion_lock);
89850fce 1347 spin_lock_init(&ctx->timeout_lock);
5eef4e87 1348 INIT_WQ_LIST(&ctx->iopoll_list);
de0617e4 1349 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 1350 INIT_LIST_HEAD(&ctx->timeout_list);
ef9dd637 1351 INIT_LIST_HEAD(&ctx->ltimeout_list);
d67d2263
BM
1352 spin_lock_init(&ctx->rsrc_ref_lock);
1353 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
269bbe5f
BM
1354 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1355 init_llist_head(&ctx->rsrc_put_llist);
13bf43f5 1356 INIT_LIST_HEAD(&ctx->tctx_list);
c2b6c6bc
PB
1357 ctx->submit_state.free_list.next = NULL;
1358 INIT_WQ_LIST(&ctx->locked_free_list);
9011bf9a 1359 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
6f33b0bc 1360 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
2b188cc1 1361 return ctx;
206aefde 1362err:
6224843d 1363 kfree(ctx->dummy_ubuf);
78076bb6 1364 kfree(ctx->cancel_hash);
206aefde
JA
1365 kfree(ctx);
1366 return NULL;
2b188cc1
JA
1367}
1368
8f6ed49a
PB
1369static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1370{
1371 struct io_rings *r = ctx->rings;
1372
1373 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1374 ctx->cq_extra--;
1375}
1376
9cf7c104 1377static bool req_need_defer(struct io_kiocb *req, u32 seq)
7adf4eaf 1378{
2bc9930e
JA
1379 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1380 struct io_ring_ctx *ctx = req->ctx;
a197f664 1381
8f6ed49a 1382 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
2bc9930e 1383 }
de0617e4 1384
9d858b21 1385 return false;
de0617e4
JA
1386}
1387
c97d8a0f
PB
1388#define FFS_ASYNC_READ 0x1UL
1389#define FFS_ASYNC_WRITE 0x2UL
1390#ifdef CONFIG_64BIT
1391#define FFS_ISREG 0x4UL
1392#else
1393#define FFS_ISREG 0x0UL
1394#endif
1395#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
1396
1397static inline bool io_req_ffs_set(struct io_kiocb *req)
1398{
1399 return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1400}
1401
c072481d 1402static inline void io_req_track_inflight(struct io_kiocb *req)
ce3d5aae 1403{
ce3d5aae 1404 if (!(req->flags & REQ_F_INFLIGHT)) {
ce3d5aae 1405 req->flags |= REQ_F_INFLIGHT;
b303fe2e 1406 atomic_inc(&current->io_uring->inflight_tracked);
ce3d5aae
PB
1407 }
1408}
1409
906c6caa
PB
1410static inline void io_unprep_linked_timeout(struct io_kiocb *req)
1411{
1412 req->flags &= ~REQ_F_LINK_TIMEOUT;
1413}
1414
fd08e530
PB
1415static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1416{
906c6caa
PB
1417 if (WARN_ON_ONCE(!req->link))
1418 return NULL;
1419
4d13d1a4
PB
1420 req->flags &= ~REQ_F_ARM_LTIMEOUT;
1421 req->flags |= REQ_F_LINK_TIMEOUT;
fd08e530
PB
1422
1423 /* linked timeouts should have two refs once prep'ed */
48dcd38d 1424 io_req_set_refcount(req);
4d13d1a4
PB
1425 __io_req_set_refcount(req->link, 2);
1426 return req->link;
fd08e530
PB
1427}
1428
1429static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1430{
4d13d1a4 1431 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
fd08e530
PB
1432 return NULL;
1433 return __io_prep_linked_timeout(req);
1434}
1435
1e6fa521
JA
1436static void io_prep_async_work(struct io_kiocb *req)
1437{
1438 const struct io_op_def *def = &io_op_defs[req->opcode];
1e6fa521
JA
1439 struct io_ring_ctx *ctx = req->ctx;
1440
b8e64b53
PB
1441 if (!(req->flags & REQ_F_CREDS)) {
1442 req->flags |= REQ_F_CREDS;
c10d1f98 1443 req->creds = get_current_cred();
b8e64b53 1444 }
003e8dcc 1445
e1d675df
PB
1446 req->work.list.next = NULL;
1447 req->work.flags = 0;
feaadc4f
PB
1448 if (req->flags & REQ_F_FORCE_ASYNC)
1449 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1450
1e6fa521
JA
1451 if (req->flags & REQ_F_ISREG) {
1452 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1453 io_wq_hash_work(&req->work, file_inode(req->file));
4b982bd0 1454 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1e6fa521
JA
1455 if (def->unbound_nonreg_file)
1456 req->work.flags |= IO_WQ_WORK_UNBOUND;
1457 }
e1d675df
PB
1458
1459 switch (req->opcode) {
1460 case IORING_OP_SPLICE:
1461 case IORING_OP_TEE:
e1d675df
PB
1462 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1463 req->work.flags |= IO_WQ_WORK_UNBOUND;
1464 break;
1465 }
561fb04a 1466}
cccf0ee8 1467
cbdcb435 1468static void io_prep_async_link(struct io_kiocb *req)
561fb04a 1469{
cbdcb435 1470 struct io_kiocb *cur;
54a91f3b 1471
44eff40a
PB
1472 if (req->flags & REQ_F_LINK_TIMEOUT) {
1473 struct io_ring_ctx *ctx = req->ctx;
1474
79ebeaee 1475 spin_lock(&ctx->completion_lock);
44eff40a
PB
1476 io_for_each_link(cur, req)
1477 io_prep_async_work(cur);
79ebeaee 1478 spin_unlock(&ctx->completion_lock);
44eff40a
PB
1479 } else {
1480 io_for_each_link(cur, req)
1481 io_prep_async_work(cur);
1482 }
561fb04a
JA
1483}
1484
fff4e40e
PB
1485static inline void io_req_add_compl_list(struct io_kiocb *req)
1486{
1487 struct io_submit_state *state = &req->ctx->submit_state;
1488
1489 wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1490}
1491
f237c30a 1492static void io_queue_async_work(struct io_kiocb *req, bool *locked)
561fb04a 1493{
a197f664 1494 struct io_ring_ctx *ctx = req->ctx;
cbdcb435 1495 struct io_kiocb *link = io_prep_linked_timeout(req);
5aa75ed5 1496 struct io_uring_task *tctx = req->task->io_uring;
561fb04a 1497
f237c30a
PB
1498 /* must not take the lock, NULL it as a precaution */
1499 locked = NULL;
1500
3bfe6106
JA
1501 BUG_ON(!tctx);
1502 BUG_ON(!tctx->io_wq);
561fb04a 1503
cbdcb435
PB
1504 /* init ->work of the whole link before punting */
1505 io_prep_async_link(req);
991468dc
JA
1506
1507 /*
1508 * Not expected to happen, but if we do have a bug where this _can_
1509 * happen, catch it here and ensure the request is marked as
1510 * canceled. That will make io-wq go through the usual work cancel
1511 * procedure rather than attempt to run this request (or create a new
1512 * worker for it).
1513 */
1514 if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1515 req->work.flags |= IO_WQ_WORK_CANCEL;
1516
d07f1e8a
PB
1517 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1518 &req->work, req->flags);
ebf93667 1519 io_wq_enqueue(tctx->io_wq, &req->work);
7271ef3a
JA
1520 if (link)
1521 io_queue_linked_timeout(link);
cbdcb435
PB
1522}
1523
1ee4160c 1524static void io_kill_timeout(struct io_kiocb *req, int status)
8c855885 1525 __must_hold(&req->ctx->completion_lock)
89850fce 1526 __must_hold(&req->ctx->timeout_lock)
5262f567 1527{
e8c2bc1f 1528 struct io_timeout_data *io = req->async_data;
5262f567 1529
fd9c7bc5 1530 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2ae2eb9d
PB
1531 if (status)
1532 req_set_fail(req);
01cec8c1
PB
1533 atomic_set(&req->ctx->cq_timeouts,
1534 atomic_read(&req->ctx->cq_timeouts) + 1);
135fcde8 1535 list_del_init(&req->timeout.list);
d4d19c19 1536 io_cqring_fill_event(req->ctx, req->user_data, status, 0);
91c2f697 1537 io_put_req_deferred(req);
5262f567
JA
1538 }
1539}
1540
c072481d 1541static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
de0617e4 1542{
441b8a78 1543 while (!list_empty(&ctx->defer_list)) {
27dc8338
PB
1544 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1545 struct io_defer_entry, list);
de0617e4 1546
9cf7c104 1547 if (req_need_defer(de->req, de->seq))
04518945 1548 break;
27dc8338 1549 list_del_init(&de->list);
907d1df3 1550 io_req_task_queue(de->req);
27dc8338 1551 kfree(de);
441b8a78 1552 }
04518945
PB
1553}
1554
c072481d 1555static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
89850fce 1556 __must_hold(&ctx->completion_lock)
de0617e4 1557{
441b8a78 1558 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
f010505b 1559
79ebeaee 1560 spin_lock_irq(&ctx->timeout_lock);
f18ee4cf 1561 while (!list_empty(&ctx->timeout_list)) {
f010505b 1562 u32 events_needed, events_got;
360428f8 1563 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
135fcde8 1564 struct io_kiocb, timeout.list);
de0617e4 1565
8eb7e2d0 1566 if (io_is_timeout_noseq(req))
360428f8 1567 break;
f010505b
MDG
1568
1569 /*
1570 * Since seq can easily wrap around over time, subtract
1571 * the last seq at which timeouts were flushed before comparing.
1572 * Assuming not more than 2^31-1 events have happened since,
1573 * these subtractions won't have wrapped, so we can check if
1574 * target is in [last_seq, current_seq] by comparing the two.
1575 */
1576 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1577 events_got = seq - ctx->cq_last_tm_flush;
1578 if (events_got < events_needed)
360428f8 1579 break;
bfe68a22 1580
135fcde8 1581 list_del_init(&req->timeout.list);
1ee4160c 1582 io_kill_timeout(req, 0);
f18ee4cf 1583 }
f010505b 1584 ctx->cq_last_tm_flush = seq;
79ebeaee 1585 spin_unlock_irq(&ctx->timeout_lock);
360428f8 1586}
5262f567 1587
c072481d 1588static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
360428f8 1589{
2335f6f5
PB
1590 if (ctx->off_timeout_used)
1591 io_flush_timeouts(ctx);
1592 if (ctx->drain_active)
1593 io_queue_deferred(ctx);
1594}
1595
1596static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1597{
1598 if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1599 __io_commit_cqring_flush(ctx);
ec30e04b
PB
1600 /* order cqe stores with ring update */
1601 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
de0617e4
JA
1602}
1603
90554200
JA
1604static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1605{
1606 struct io_rings *r = ctx->rings;
1607
a566c556 1608 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
90554200
JA
1609}
1610
888aae2e
PB
1611static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1612{
1613 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1614}
1615
d068b506 1616static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
2b188cc1 1617{
75b28aff 1618 struct io_rings *rings = ctx->rings;
ea5ab3b5 1619 unsigned tail, mask = ctx->cq_entries - 1;
2b188cc1 1620
115e12e5
SB
1621 /*
1622 * writes to the cq entry need to come after reading head; the
1623 * control dependency is enough as we're using WRITE_ONCE to
1624 * fill the cq entry
1625 */
a566c556 1626 if (__io_cqring_events(ctx) == ctx->cq_entries)
2b188cc1
JA
1627 return NULL;
1628
888aae2e 1629 tail = ctx->cached_cq_tail++;
ea5ab3b5 1630 return &rings->cqes[tail & mask];
2b188cc1
JA
1631}
1632
f2842ab5
JA
1633static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1634{
44c769de 1635 if (likely(!ctx->cq_ev_fd))
f0b493e6 1636 return false;
7e55a19c
SG
1637 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1638 return false;
44c769de 1639 return !ctx->eventfd_async || io_wq_current_is_worker();
f2842ab5
JA
1640}
1641
2c5d763c
JA
1642/*
1643 * This should only get called when at least one event has been posted.
1644 * Some applications rely on the eventfd notification count only changing
1645 * IFF a new CQE has been added to the CQ ring. There's no depedency on
1646 * 1:1 relationship between how many times this function is called (and
1647 * hence the eventfd count) and number of CQEs posted to the CQ ring.
1648 */
b41e9852 1649static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 1650{
5fd46178
JA
1651 /*
1652 * wake_up_all() may seem excessive, but io_wake_function() and
1653 * io_should_wake() handle the termination of the loop and only
1654 * wake as many waiters as we need to.
1655 */
1656 if (wq_has_sleeper(&ctx->cq_wait))
1657 wake_up_all(&ctx->cq_wait);
b41e9852 1658 if (io_should_trigger_evfd(ctx))
1d7bb1d5
JA
1659 eventfd_signal(ctx->cq_ev_fd, 1);
1660}
1661
80c18e4a
PB
1662static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1663{
c57a91fb
PB
1664 /* see waitqueue_active() comment */
1665 smp_mb();
1666
80c18e4a 1667 if (ctx->flags & IORING_SETUP_SQPOLL) {
c57a91fb 1668 if (waitqueue_active(&ctx->cq_wait))
5fd46178 1669 wake_up_all(&ctx->cq_wait);
80c18e4a
PB
1670 }
1671 if (io_should_trigger_evfd(ctx))
1672 eventfd_signal(ctx->cq_ev_fd, 1);
1673}
1674
c4a2ed72 1675/* Returns true if there are no backlogged entries after the flush */
6c2450ae 1676static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5 1677{
b18032bb 1678 bool all_flushed, posted;
1d7bb1d5 1679
a566c556 1680 if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
e23de15f 1681 return false;
1d7bb1d5 1682
b18032bb 1683 posted = false;
79ebeaee 1684 spin_lock(&ctx->completion_lock);
6c2450ae 1685 while (!list_empty(&ctx->cq_overflow_list)) {
d068b506 1686 struct io_uring_cqe *cqe = io_get_cqe(ctx);
6c2450ae 1687 struct io_overflow_cqe *ocqe;
e6c8aa9a 1688
1d7bb1d5
JA
1689 if (!cqe && !force)
1690 break;
6c2450ae
PB
1691 ocqe = list_first_entry(&ctx->cq_overflow_list,
1692 struct io_overflow_cqe, list);
1693 if (cqe)
1694 memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1695 else
8f6ed49a
PB
1696 io_account_cq_overflow(ctx);
1697
b18032bb 1698 posted = true;
6c2450ae
PB
1699 list_del(&ocqe->list);
1700 kfree(ocqe);
1d7bb1d5
JA
1701 }
1702
09e88404
PB
1703 all_flushed = list_empty(&ctx->cq_overflow_list);
1704 if (all_flushed) {
5ed7a37d 1705 clear_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1706 WRITE_ONCE(ctx->rings->sq_flags,
1707 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
09e88404 1708 }
46930143 1709
b18032bb
JA
1710 if (posted)
1711 io_commit_cqring(ctx);
79ebeaee 1712 spin_unlock(&ctx->completion_lock);
b18032bb
JA
1713 if (posted)
1714 io_cqring_ev_posted(ctx);
09e88404 1715 return all_flushed;
1d7bb1d5
JA
1716}
1717
90f67366 1718static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
6c503150 1719{
ca0a2651
JA
1720 bool ret = true;
1721
5ed7a37d 1722 if (test_bit(0, &ctx->check_cq_overflow)) {
6c503150
PB
1723 /* iopoll syncs against uring_lock, not completion_lock */
1724 if (ctx->flags & IORING_SETUP_IOPOLL)
1725 mutex_lock(&ctx->uring_lock);
90f67366 1726 ret = __io_cqring_overflow_flush(ctx, false);
6c503150
PB
1727 if (ctx->flags & IORING_SETUP_IOPOLL)
1728 mutex_unlock(&ctx->uring_lock);
1729 }
ca0a2651
JA
1730
1731 return ret;
6c503150
PB
1732}
1733
6a290a14
PB
1734/* must to be called somewhat shortly after putting a request */
1735static inline void io_put_task(struct task_struct *task, int nr)
1736{
1737 struct io_uring_task *tctx = task->io_uring;
1738
e98e49b2
PB
1739 if (likely(task == current)) {
1740 tctx->cached_refs += nr;
1741 } else {
1742 percpu_counter_sub(&tctx->inflight, nr);
1743 if (unlikely(atomic_read(&tctx->in_idle)))
1744 wake_up(&tctx->wait);
1745 put_task_struct_many(task, nr);
1746 }
6a290a14
PB
1747}
1748
9a10867a
PB
1749static void io_task_refs_refill(struct io_uring_task *tctx)
1750{
1751 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
1752
1753 percpu_counter_add(&tctx->inflight, refill);
1754 refcount_add(refill, &current->usage);
1755 tctx->cached_refs += refill;
1756}
1757
1758static inline void io_get_task_refs(int nr)
1759{
1760 struct io_uring_task *tctx = current->io_uring;
1761
1762 tctx->cached_refs -= nr;
1763 if (unlikely(tctx->cached_refs < 0))
1764 io_task_refs_refill(tctx);
1765}
1766
d4d19c19 1767static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1768 s32 res, u32 cflags)
2b188cc1 1769{
cce4b8b0 1770 struct io_overflow_cqe *ocqe;
2b188cc1 1771
cce4b8b0
PB
1772 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1773 if (!ocqe) {
1774 /*
1775 * If we're in ring overflow flush mode, or in task cancel mode,
1776 * or cannot allocate an overflow entry, then we need to drop it
1777 * on the floor.
1778 */
8f6ed49a 1779 io_account_cq_overflow(ctx);
cce4b8b0 1780 return false;
2b188cc1 1781 }
cce4b8b0 1782 if (list_empty(&ctx->cq_overflow_list)) {
5ed7a37d 1783 set_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1784 WRITE_ONCE(ctx->rings->sq_flags,
1785 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1786
cce4b8b0 1787 }
d4d19c19 1788 ocqe->cqe.user_data = user_data;
cce4b8b0
PB
1789 ocqe->cqe.res = res;
1790 ocqe->cqe.flags = cflags;
1791 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1792 return true;
2b188cc1
JA
1793}
1794
d4d19c19 1795static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1796 s32 res, u32 cflags)
2b188cc1
JA
1797{
1798 struct io_uring_cqe *cqe;
1799
d4d19c19 1800 trace_io_uring_complete(ctx, user_data, res, cflags);
51c3ff62 1801
2b188cc1
JA
1802 /*
1803 * If we can't get a cq entry, userspace overflowed the
1804 * submission (by quite a lot). Increment the overflow count in
1805 * the ring.
1806 */
d068b506 1807 cqe = io_get_cqe(ctx);
1d7bb1d5 1808 if (likely(cqe)) {
d4d19c19 1809 WRITE_ONCE(cqe->user_data, user_data);
2b188cc1 1810 WRITE_ONCE(cqe->res, res);
bcda7baa 1811 WRITE_ONCE(cqe->flags, cflags);
8d13326e 1812 return true;
2b188cc1 1813 }
d4d19c19 1814 return io_cqring_event_overflow(ctx, user_data, res, cflags);
2b188cc1
JA
1815}
1816
8d13326e 1817/* not as hot to bloat with inlining */
d4d19c19 1818static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 1819 s32 res, u32 cflags)
bcda7baa 1820{
d4d19c19 1821 return __io_cqring_fill_event(ctx, user_data, res, cflags);
bcda7baa
JA
1822}
1823
54daa9b2
PB
1824static void io_req_complete_post(struct io_kiocb *req, s32 res,
1825 u32 cflags)
2b188cc1 1826{
78e19bbe 1827 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 1828
79ebeaee 1829 spin_lock(&ctx->completion_lock);
d4d19c19 1830 __io_cqring_fill_event(ctx, req->user_data, res, cflags);
c7dae4ba
JA
1831 /*
1832 * If we're the last reference to this request, add to our locked
1833 * free_list cache.
1834 */
de9b4cca 1835 if (req_ref_put_and_test(req)) {
7a612350 1836 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
0756a869 1837 if (req->flags & IO_DISARM_MASK)
7a612350
PB
1838 io_disarm_next(req);
1839 if (req->link) {
1840 io_req_task_queue(req->link);
1841 req->link = NULL;
1842 }
1843 }
ab409402 1844 io_req_put_rsrc(req, ctx);
c7dae4ba
JA
1845 io_dismantle_req(req);
1846 io_put_task(req->task, 1);
c2b6c6bc 1847 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
d0acdee2 1848 ctx->locked_free_nr++;
180f829f 1849 }
7a612350 1850 io_commit_cqring(ctx);
79ebeaee 1851 spin_unlock(&ctx->completion_lock);
a3f34907 1852 io_cqring_ev_posted(ctx);
229a7b63
JA
1853}
1854
54daa9b2
PB
1855static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
1856 u32 cflags)
229a7b63 1857{
a38d68db 1858 req->result = res;
d17e56eb 1859 req->cflags = cflags;
e342c807 1860 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
1861}
1862
889fca73 1863static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
54daa9b2 1864 s32 res, u32 cflags)
bcda7baa 1865{
889fca73
PB
1866 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1867 io_req_complete_state(req, res, cflags);
a38d68db 1868 else
c7dae4ba 1869 io_req_complete_post(req, res, cflags);
bcda7baa
JA
1870}
1871
54daa9b2 1872static inline void io_req_complete(struct io_kiocb *req, s32 res)
0ddf92e8 1873{
889fca73 1874 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
1875}
1876
54daa9b2 1877static void io_req_complete_failed(struct io_kiocb *req, s32 res)
f41db273 1878{
93d2bcd2 1879 req_set_fail(req);
f41db273
PB
1880 io_req_complete_post(req, res, 0);
1881}
1882
c6d3d9cb
PB
1883static void io_req_complete_fail_submit(struct io_kiocb *req)
1884{
1885 /*
1886 * We don't submit, fail them all, for that replace hardlinks with
1887 * normal links. Extra REQ_F_LINK is tolerated.
1888 */
1889 req->flags &= ~REQ_F_HARDLINK;
1890 req->flags |= REQ_F_LINK;
1891 io_req_complete_failed(req, req->result);
1892}
1893
864ea921
PB
1894/*
1895 * Don't initialise the fields below on every allocation, but do that in
1896 * advance and keep them valid across allocations.
1897 */
1898static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1899{
1900 req->ctx = ctx;
1901 req->link = NULL;
1902 req->async_data = NULL;
1903 /* not necessary, but safer to zero */
1904 req->result = 0;
1905}
1906
dac7a098 1907static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
cd0ca2e0 1908 struct io_submit_state *state)
dac7a098 1909{
79ebeaee 1910 spin_lock(&ctx->completion_lock);
c2b6c6bc 1911 wq_list_splice(&ctx->locked_free_list, &state->free_list);
d0acdee2 1912 ctx->locked_free_nr = 0;
79ebeaee 1913 spin_unlock(&ctx->completion_lock);
dac7a098
PB
1914}
1915
dd78f492 1916/* Returns true IFF there are requests in the cache */
c7dae4ba 1917static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
0ddf92e8 1918{
c7dae4ba 1919 struct io_submit_state *state = &ctx->submit_state;
0ddf92e8 1920
c7dae4ba
JA
1921 /*
1922 * If we have more than a batch's worth of requests in our IRQ side
1923 * locked cache, grab the lock and move them over to our submission
1924 * side cache.
1925 */
d0acdee2 1926 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
cd0ca2e0 1927 io_flush_cached_locked_reqs(ctx, state);
c2b6c6bc 1928 return !!state->free_list.next;
0ddf92e8
JA
1929}
1930
5d5901a3
PB
1931/*
1932 * A request might get retired back into the request caches even before opcode
1933 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1934 * Because of that, io_alloc_req() should be called only under ->uring_lock
1935 * and with extra caution to not get a request that is still worked on.
1936 */
c072481d 1937static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
5d5901a3 1938 __must_hold(&ctx->uring_lock)
2b188cc1 1939{
e5d1bc0a 1940 struct io_submit_state *state = &ctx->submit_state;
864ea921 1941 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
3ab665b7
PB
1942 void *reqs[IO_REQ_ALLOC_BATCH];
1943 struct io_kiocb *req;
864ea921 1944 int ret, i;
e5d1bc0a 1945
c2b6c6bc 1946 if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
a33ae9ce 1947 return true;
e5d1bc0a 1948
3ab665b7 1949 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
fd6fab2c 1950
864ea921
PB
1951 /*
1952 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1953 * retry single alloc to be on the safe side.
1954 */
1955 if (unlikely(ret <= 0)) {
3ab665b7
PB
1956 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1957 if (!reqs[0])
a33ae9ce 1958 return false;
864ea921 1959 ret = 1;
2b188cc1 1960 }
864ea921 1961
37f0e767 1962 percpu_ref_get_many(&ctx->refs, ret);
3ab665b7
PB
1963 for (i = 0; i < ret; i++) {
1964 req = reqs[i];
1965
1966 io_preinit_req(req, ctx);
c2b6c6bc 1967 wq_stack_add_head(&req->comp_list, &state->free_list);
3ab665b7 1968 }
a33ae9ce
PB
1969 return true;
1970}
1971
1972static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
1973{
1974 if (unlikely(!ctx->submit_state.free_list.next))
1975 return __io_alloc_req_refill(ctx);
1976 return true;
1977}
1978
1979static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1980{
1981 struct io_wq_work_node *node;
1982
1983 node = wq_stack_extract(&ctx->submit_state.free_list);
c2b6c6bc 1984 return container_of(node, struct io_kiocb, comp_list);
2b188cc1
JA
1985}
1986
e1d767f0 1987static inline void io_put_file(struct file *file)
8da11c19 1988{
e1d767f0 1989 if (file)
8da11c19
PB
1990 fput(file);
1991}
1992
6b639522 1993static inline void io_dismantle_req(struct io_kiocb *req)
2b188cc1 1994{
094bae49 1995 unsigned int flags = req->flags;
929a3af9 1996
867f8fa5 1997 if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
3a0a6902 1998 io_clean_op(req);
e1d767f0
PB
1999 if (!(flags & REQ_F_FIXED_FILE))
2000 io_put_file(req->file);
e65ef56d
JA
2001}
2002
c072481d 2003static __cold void __io_free_req(struct io_kiocb *req)
c6ca97b3 2004{
51a4cc11 2005 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 2006
ab409402 2007 io_req_put_rsrc(req, ctx);
216578e5 2008 io_dismantle_req(req);
7c660731 2009 io_put_task(req->task, 1);
c6ca97b3 2010
79ebeaee 2011 spin_lock(&ctx->completion_lock);
c2b6c6bc 2012 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
c34b025f 2013 ctx->locked_free_nr++;
79ebeaee 2014 spin_unlock(&ctx->completion_lock);
e65ef56d
JA
2015}
2016
f2f87370
PB
2017static inline void io_remove_next_linked(struct io_kiocb *req)
2018{
2019 struct io_kiocb *nxt = req->link;
2020
2021 req->link = nxt->link;
2022 nxt->link = NULL;
2023}
2024
33cc89a9
PB
2025static bool io_kill_linked_timeout(struct io_kiocb *req)
2026 __must_hold(&req->ctx->completion_lock)
89b263f6 2027 __must_hold(&req->ctx->timeout_lock)
2665abfd 2028{
33cc89a9 2029 struct io_kiocb *link = req->link;
f2f87370 2030
b97e736a 2031 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
c9abd7ad 2032 struct io_timeout_data *io = link->async_data;
7c86ffee 2033
f2f87370 2034 io_remove_next_linked(req);
90cd7e42 2035 link->timeout.head = NULL;
fd9c7bc5 2036 if (hrtimer_try_to_cancel(&io->timer) != -1) {
ef9dd637 2037 list_del(&link->timeout.list);
d4d19c19
PB
2038 io_cqring_fill_event(link->ctx, link->user_data,
2039 -ECANCELED, 0);
91c2f697 2040 io_put_req_deferred(link);
d4729fbd 2041 return true;
c9abd7ad
PB
2042 }
2043 }
d4729fbd 2044 return false;
7c86ffee
PB
2045}
2046
d148ca4b 2047static void io_fail_links(struct io_kiocb *req)
33cc89a9 2048 __must_hold(&req->ctx->completion_lock)
9e645e11 2049{
33cc89a9 2050 struct io_kiocb *nxt, *link = req->link;
9e645e11 2051
f2f87370 2052 req->link = NULL;
f2f87370 2053 while (link) {
a8295b98
HX
2054 long res = -ECANCELED;
2055
2056 if (link->flags & REQ_F_FAIL)
2057 res = link->result;
2058
f2f87370
PB
2059 nxt = link->link;
2060 link->link = NULL;
2665abfd 2061
f2f87370 2062 trace_io_uring_fail_link(req, link);
a8295b98 2063 io_cqring_fill_event(link->ctx, link->user_data, res, 0);
91c2f697 2064 io_put_req_deferred(link);
f2f87370 2065 link = nxt;
9e645e11 2066 }
33cc89a9 2067}
9e645e11 2068
33cc89a9
PB
2069static bool io_disarm_next(struct io_kiocb *req)
2070 __must_hold(&req->ctx->completion_lock)
2071{
2072 bool posted = false;
2073
0756a869
PB
2074 if (req->flags & REQ_F_ARM_LTIMEOUT) {
2075 struct io_kiocb *link = req->link;
2076
906c6caa 2077 req->flags &= ~REQ_F_ARM_LTIMEOUT;
0756a869
PB
2078 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2079 io_remove_next_linked(req);
2080 io_cqring_fill_event(link->ctx, link->user_data,
2081 -ECANCELED, 0);
2082 io_put_req_deferred(link);
2083 posted = true;
2084 }
2085 } else if (req->flags & REQ_F_LINK_TIMEOUT) {
89b263f6
JA
2086 struct io_ring_ctx *ctx = req->ctx;
2087
2088 spin_lock_irq(&ctx->timeout_lock);
33cc89a9 2089 posted = io_kill_linked_timeout(req);
89b263f6
JA
2090 spin_unlock_irq(&ctx->timeout_lock);
2091 }
93d2bcd2 2092 if (unlikely((req->flags & REQ_F_FAIL) &&
e4335ed3 2093 !(req->flags & REQ_F_HARDLINK))) {
33cc89a9
PB
2094 posted |= (req->link != NULL);
2095 io_fail_links(req);
2096 }
2097 return posted;
9e645e11
JA
2098}
2099
d81499bf
PB
2100static void __io_req_find_next_prep(struct io_kiocb *req)
2101{
2102 struct io_ring_ctx *ctx = req->ctx;
2103 bool posted;
2104
2105 spin_lock(&ctx->completion_lock);
2106 posted = io_disarm_next(req);
2107 if (posted)
2108 io_commit_cqring(req->ctx);
2109 spin_unlock(&ctx->completion_lock);
2110 if (posted)
2111 io_cqring_ev_posted(ctx);
2112}
2113
2114static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
c69f8dbe 2115{
33cc89a9 2116 struct io_kiocb *nxt;
944e58bf 2117
d81499bf
PB
2118 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2119 return NULL;
9e645e11
JA
2120 /*
2121 * If LINK is set, we have dependent requests in this chain. If we
2122 * didn't fail this request, queue the first one up, moving any other
2123 * dependencies to the next request. In case of failure, fail the rest
2124 * of the chain.
2125 */
d81499bf
PB
2126 if (unlikely(req->flags & IO_DISARM_MASK))
2127 __io_req_find_next_prep(req);
33cc89a9
PB
2128 nxt = req->link;
2129 req->link = NULL;
2130 return nxt;
4d7dd462 2131}
9e645e11 2132
f237c30a 2133static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2c32395d
PB
2134{
2135 if (!ctx)
2136 return;
f237c30a 2137 if (*locked) {
c450178d 2138 io_submit_flush_completions(ctx);
2c32395d 2139 mutex_unlock(&ctx->uring_lock);
f237c30a 2140 *locked = false;
2c32395d
PB
2141 }
2142 percpu_ref_put(&ctx->refs);
2143}
2144
7cbf1722 2145static void tctx_task_work(struct callback_head *cb)
c40f6379 2146{
f237c30a 2147 bool locked = false;
ebd0df2e 2148 struct io_ring_ctx *ctx = NULL;
3f18407d
PB
2149 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2150 task_work);
c40f6379 2151
16f72070 2152 while (1) {
3f18407d
PB
2153 struct io_wq_work_node *node;
2154
c450178d 2155 if (!tctx->task_list.first && locked)
8d4ad41e
PB
2156 io_submit_flush_completions(ctx);
2157
3f18407d 2158 spin_lock_irq(&tctx->task_lock);
c6538be9 2159 node = tctx->task_list.first;
3f18407d 2160 INIT_WQ_LIST(&tctx->task_list);
6294f368
PB
2161 if (!node)
2162 tctx->task_running = false;
3f18407d 2163 spin_unlock_irq(&tctx->task_lock);
6294f368
PB
2164 if (!node)
2165 break;
3f18407d 2166
6294f368 2167 do {
3f18407d
PB
2168 struct io_wq_work_node *next = node->next;
2169 struct io_kiocb *req = container_of(node, struct io_kiocb,
2170 io_task_work.node);
2171
2172 if (req->ctx != ctx) {
f237c30a 2173 ctx_flush_and_put(ctx, &locked);
3f18407d 2174 ctx = req->ctx;
126180b9
PB
2175 /* if not contended, grab and improve batching */
2176 locked = mutex_trylock(&ctx->uring_lock);
3f18407d
PB
2177 percpu_ref_get(&ctx->refs);
2178 }
f237c30a 2179 req->io_task_work.func(req, &locked);
3f18407d 2180 node = next;
6294f368
PB
2181 } while (node);
2182
7cbf1722 2183 cond_resched();
3f18407d 2184 }
ebd0df2e 2185
f237c30a 2186 ctx_flush_and_put(ctx, &locked);
7cbf1722
JA
2187}
2188
e09ee510 2189static void io_req_task_work_add(struct io_kiocb *req)
7cbf1722 2190{
c15b79de 2191 struct task_struct *tsk = req->task;
7cbf1722 2192 struct io_uring_task *tctx = tsk->io_uring;
c15b79de 2193 enum task_work_notify_mode notify;
e09ee510 2194 struct io_wq_work_node *node;
0b81e80c 2195 unsigned long flags;
6294f368 2196 bool running;
7cbf1722
JA
2197
2198 WARN_ON_ONCE(!tctx);
2199
0b81e80c 2200 spin_lock_irqsave(&tctx->task_lock, flags);
7cbf1722 2201 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
6294f368
PB
2202 running = tctx->task_running;
2203 if (!running)
2204 tctx->task_running = true;
0b81e80c 2205 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2206
2207 /* task_work already pending, we're done */
6294f368 2208 if (running)
e09ee510 2209 return;
7cbf1722 2210
c15b79de
PB
2211 /*
2212 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2213 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2214 * processing task_work. There's no reliable way to tell if TWA_RESUME
2215 * will do the job.
2216 */
2217 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
d97ec623
PB
2218 if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2219 if (notify == TWA_NONE)
2220 wake_up_process(tsk);
e09ee510 2221 return;
c15b79de 2222 }
2215bed9 2223
0b81e80c 2224 spin_lock_irqsave(&tctx->task_lock, flags);
6294f368 2225 tctx->task_running = false;
e09ee510
PB
2226 node = tctx->task_list.first;
2227 INIT_WQ_LIST(&tctx->task_list);
0b81e80c 2228 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722 2229
e09ee510
PB
2230 while (node) {
2231 req = container_of(node, struct io_kiocb, io_task_work.node);
2232 node = node->next;
2233 if (llist_add(&req->io_task_work.fallback_node,
2234 &req->ctx->fallback_llist))
2235 schedule_delayed_work(&req->ctx->fallback_work, 1);
2236 }
eab30c4d
PB
2237}
2238
f237c30a 2239static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
c40f6379 2240{
87ceb6a6 2241 struct io_ring_ctx *ctx = req->ctx;
c40f6379 2242
b18a1a45 2243 /* not needed for normal modes, but SQPOLL depends on it */
f237c30a 2244 io_tw_lock(ctx, locked);
2593553a 2245 io_req_complete_failed(req, req->result);
c40f6379
JA
2246}
2247
f237c30a 2248static void io_req_task_submit(struct io_kiocb *req, bool *locked)
c40f6379
JA
2249{
2250 struct io_ring_ctx *ctx = req->ctx;
2251
f237c30a 2252 io_tw_lock(ctx, locked);
316319e8 2253 /* req->task == current here, checking PF_EXITING is safe */
af066f31 2254 if (likely(!(req->task->flags & PF_EXITING)))
c5eef2b9 2255 __io_queue_sqe(req);
81b6d05c 2256 else
2593553a 2257 io_req_complete_failed(req, -EFAULT);
c40f6379
JA
2258}
2259
2c4b8eb6 2260static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
c40f6379 2261{
2c4b8eb6 2262 req->result = ret;
5b0a6acc 2263 req->io_task_work.func = io_req_task_cancel;
e09ee510 2264 io_req_task_work_add(req);
c40f6379
JA
2265}
2266
2c4b8eb6 2267static void io_req_task_queue(struct io_kiocb *req)
a3df7698 2268{
5b0a6acc 2269 req->io_task_work.func = io_req_task_submit;
e09ee510 2270 io_req_task_work_add(req);
a3df7698
PB
2271}
2272
773af691
JA
2273static void io_req_task_queue_reissue(struct io_kiocb *req)
2274{
2275 req->io_task_work.func = io_queue_async_work;
2276 io_req_task_work_add(req);
2277}
2278
f2f87370 2279static inline void io_queue_next(struct io_kiocb *req)
c69f8dbe 2280{
9b5f7bd9 2281 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf
PB
2282
2283 if (nxt)
906a8c3f 2284 io_req_task_queue(nxt);
c69f8dbe
JL
2285}
2286
c3524383 2287static void io_free_req(struct io_kiocb *req)
7a743e22 2288{
c3524383
PB
2289 io_queue_next(req);
2290 __io_free_req(req);
2291}
8766dd51 2292
f237c30a
PB
2293static void io_free_req_work(struct io_kiocb *req, bool *locked)
2294{
2295 io_free_req(req);
2296}
2297
3aa83bfb 2298static void io_free_batch_list(struct io_ring_ctx *ctx,
1cce17ac 2299 struct io_wq_work_node *node)
3aa83bfb
PB
2300 __must_hold(&ctx->uring_lock)
2301{
d4b7a5ef 2302 struct task_struct *task = NULL;
37f0e767 2303 int task_refs = 0;
3aa83bfb 2304
3aa83bfb
PB
2305 do {
2306 struct io_kiocb *req = container_of(node, struct io_kiocb,
2307 comp_list);
2308
def77acf 2309 if (unlikely(req->flags & REQ_F_REFCOUNT)) {
c1e53a69 2310 node = req->comp_list.next;
def77acf
PB
2311 if (!req_ref_put_and_test(req))
2312 continue;
c1e53a69 2313 }
d4b7a5ef 2314
ab409402 2315 io_req_put_rsrc_locked(req, ctx);
d4b7a5ef
PB
2316 io_queue_next(req);
2317 io_dismantle_req(req);
2318
2319 if (req->task != task) {
2320 if (task)
2321 io_put_task(task, task_refs);
2322 task = req->task;
2323 task_refs = 0;
2324 }
2325 task_refs++;
c1e53a69 2326 node = req->comp_list.next;
d4b7a5ef 2327 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
3aa83bfb 2328 } while (node);
d4b7a5ef 2329
d4b7a5ef
PB
2330 if (task)
2331 io_put_task(task, task_refs);
3aa83bfb
PB
2332}
2333
c450178d 2334static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
a141dd89 2335 __must_hold(&ctx->uring_lock)
905c172f 2336{
6f33b0bc 2337 struct io_wq_work_node *node, *prev;
cd0ca2e0 2338 struct io_submit_state *state = &ctx->submit_state;
905c172f 2339
79ebeaee 2340 spin_lock(&ctx->completion_lock);
6f33b0bc
PB
2341 wq_list_for_each(node, prev, &state->compl_reqs) {
2342 struct io_kiocb *req = container_of(node, struct io_kiocb,
2343 comp_list);
5182ed2e 2344
d4d19c19 2345 __io_cqring_fill_event(ctx, req->user_data, req->result,
d17e56eb 2346 req->cflags);
905c172f
PB
2347 }
2348 io_commit_cqring(ctx);
79ebeaee 2349 spin_unlock(&ctx->completion_lock);
905c172f 2350 io_cqring_ev_posted(ctx);
5182ed2e 2351
1cce17ac 2352 io_free_batch_list(ctx, state->compl_reqs.first);
6f33b0bc 2353 INIT_WQ_LIST(&state->compl_reqs);
7a743e22
PB
2354}
2355
ba816ad6
JA
2356/*
2357 * Drop reference to request, return next in chain (if there is one) if this
2358 * was the last reference to this request.
2359 */
0d85035a 2360static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2361{
9b5f7bd9
PB
2362 struct io_kiocb *nxt = NULL;
2363
de9b4cca 2364 if (req_ref_put_and_test(req)) {
9b5f7bd9 2365 nxt = io_req_find_next(req);
4d7dd462 2366 __io_free_req(req);
2a44f467 2367 }
9b5f7bd9 2368 return nxt;
2b188cc1
JA
2369}
2370
0d85035a 2371static inline void io_put_req(struct io_kiocb *req)
e65ef56d 2372{
de9b4cca 2373 if (req_ref_put_and_test(req))
e65ef56d 2374 io_free_req(req);
2b188cc1
JA
2375}
2376
91c2f697 2377static inline void io_put_req_deferred(struct io_kiocb *req)
216578e5 2378{
91c2f697 2379 if (req_ref_put_and_test(req)) {
f237c30a 2380 req->io_task_work.func = io_free_req_work;
543af3a1
PB
2381 io_req_task_work_add(req);
2382 }
216578e5
PB
2383}
2384
6c503150 2385static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2386{
2387 /* See comment at the top of this file */
2388 smp_rmb();
e23de15f 2389 return __io_cqring_events(ctx);
a3a0e43f
JA
2390}
2391
fb5ccc98
PB
2392static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2393{
2394 struct io_rings *rings = ctx->rings;
2395
2396 /* make sure SQ entry isn't read before tail */
2397 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2398}
2399
8ff069bf 2400static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
e94f141b 2401{
8ff069bf 2402 unsigned int cflags;
e94f141b 2403
bcda7baa
JA
2404 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2405 cflags |= IORING_CQE_F_BUFFER;
0e1b6fe3 2406 req->flags &= ~REQ_F_BUFFER_SELECTED;
bcda7baa
JA
2407 kfree(kbuf);
2408 return cflags;
e94f141b
JA
2409}
2410
8ff069bf 2411static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
bcda7baa 2412{
ae421d93
PB
2413 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2414 return 0;
30d51dd4 2415 return io_put_kbuf(req, req->kbuf);
8ff069bf
PB
2416}
2417
4c6e277c
JA
2418static inline bool io_run_task_work(void)
2419{
ef98eb04 2420 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
4c6e277c 2421 __set_current_state(TASK_RUNNING);
ef98eb04 2422 tracehook_notify_signal();
4c6e277c
JA
2423 return true;
2424 }
2425
2426 return false;
bcda7baa
JA
2427}
2428
5ba3c874 2429static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
def596e9 2430{
5eef4e87 2431 struct io_wq_work_node *pos, *start, *prev;
d729cf9a 2432 unsigned int poll_flags = BLK_POLL_NOSLEEP;
b688f11e 2433 DEFINE_IO_COMP_BATCH(iob);
5ba3c874 2434 int nr_events = 0;
def596e9
JA
2435
2436 /*
2437 * Only spin for completions if we don't have multiple devices hanging
87a115fb 2438 * off our complete list.
def596e9 2439 */
87a115fb 2440 if (ctx->poll_multi_queue || force_nonspin)
ef99b2d3 2441 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2442
5eef4e87
PB
2443 wq_list_for_each(pos, start, &ctx->iopoll_list) {
2444 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
9adbd45d 2445 struct kiocb *kiocb = &req->rw.kiocb;
a2416e1e 2446 int ret;
def596e9
JA
2447
2448 /*
581f9810
BM
2449 * Move completed and retryable entries to our local lists.
2450 * If we find a request that requires polling, break out
2451 * and complete those lists first, if we have entries there.
def596e9 2452 */
e3f721e6 2453 if (READ_ONCE(req->iopoll_completed))
def596e9
JA
2454 break;
2455
b688f11e 2456 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
a2416e1e
PB
2457 if (unlikely(ret < 0))
2458 return ret;
2459 else if (ret)
ef99b2d3 2460 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2461
3aadc23e 2462 /* iopoll may have completed current req */
b688f11e 2463 if (!rq_list_empty(iob.req_list) ||
e3f721e6
PB
2464 READ_ONCE(req->iopoll_completed))
2465 break;
def596e9
JA
2466 }
2467
b688f11e
JA
2468 if (!rq_list_empty(iob.req_list))
2469 iob.complete(&iob);
5eef4e87
PB
2470 else if (!pos)
2471 return 0;
2472
2473 prev = start;
2474 wq_list_for_each_resume(pos, prev) {
2475 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2476
b3fa03fd
PB
2477 /* order with io_complete_rw_iopoll(), e.g. ->result updates */
2478 if (!smp_load_acquire(&req->iopoll_completed))
e3f721e6 2479 break;
b3fa03fd 2480 __io_cqring_fill_event(ctx, req->user_data, req->result,
f5ed3bcd 2481 io_put_rw_kbuf(req));
e3f721e6
PB
2482 nr_events++;
2483 }
def596e9 2484
f5ed3bcd
PB
2485 if (unlikely(!nr_events))
2486 return 0;
2487
2488 io_commit_cqring(ctx);
2489 io_cqring_ev_posted_iopoll(ctx);
1cce17ac 2490 pos = start ? start->next : ctx->iopoll_list.first;
5eef4e87 2491 wq_list_cut(&ctx->iopoll_list, prev, start);
1cce17ac 2492 io_free_batch_list(ctx, pos);
5ba3c874 2493 return nr_events;
def596e9
JA
2494}
2495
def596e9
JA
2496/*
2497 * We can't just wait for polled events to come to us, we have to actively
2498 * find and complete them.
2499 */
c072481d 2500static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2501{
2502 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2503 return;
2504
2505 mutex_lock(&ctx->uring_lock);
5eef4e87 2506 while (!wq_list_empty(&ctx->iopoll_list)) {
b2edc0a7 2507 /* let it sleep and repeat later if can't complete a request */
5ba3c874 2508 if (io_do_iopoll(ctx, true) == 0)
b2edc0a7 2509 break;
08f5439f
JA
2510 /*
2511 * Ensure we allow local-to-the-cpu processing to take place,
2512 * in this case we need to ensure that we reap all events.
3fcee5a6 2513 * Also let task_work, etc. to progress by releasing the mutex
08f5439f 2514 */
3fcee5a6
PB
2515 if (need_resched()) {
2516 mutex_unlock(&ctx->uring_lock);
2517 cond_resched();
2518 mutex_lock(&ctx->uring_lock);
2519 }
def596e9
JA
2520 }
2521 mutex_unlock(&ctx->uring_lock);
2522}
2523
7668b92a 2524static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2525{
7668b92a 2526 unsigned int nr_events = 0;
e9979b36 2527 int ret = 0;
500f9fba 2528
c7849be9
XW
2529 /*
2530 * We disallow the app entering submit/complete with polling, but we
2531 * still need to lock the ring to prevent racing with polled issue
2532 * that got punted to a workqueue.
2533 */
2534 mutex_lock(&ctx->uring_lock);
f39c8a5b
PB
2535 /*
2536 * Don't enter poll loop if we already have events pending.
2537 * If we do, we can potentially be spinning for commands that
2538 * already triggered a CQE (eg in error).
2539 */
5ed7a37d 2540 if (test_bit(0, &ctx->check_cq_overflow))
f39c8a5b
PB
2541 __io_cqring_overflow_flush(ctx, false);
2542 if (io_cqring_events(ctx))
2543 goto out;
def596e9 2544 do {
500f9fba
JA
2545 /*
2546 * If a submit got punted to a workqueue, we can have the
2547 * application entering polling for a command before it gets
2548 * issued. That app will hold the uring_lock for the duration
2549 * of the poll right here, so we need to take a breather every
2550 * now and then to ensure that the issue has a chance to add
2551 * the poll to the issued list. Otherwise we can spin here
2552 * forever, while the workqueue is stuck trying to acquire the
2553 * very same mutex.
2554 */
5eef4e87 2555 if (wq_list_empty(&ctx->iopoll_list)) {
8f487ef2
PB
2556 u32 tail = ctx->cached_cq_tail;
2557
500f9fba 2558 mutex_unlock(&ctx->uring_lock);
4c6e277c 2559 io_run_task_work();
500f9fba 2560 mutex_lock(&ctx->uring_lock);
def596e9 2561
8f487ef2
PB
2562 /* some requests don't go through iopoll_list */
2563 if (tail != ctx->cached_cq_tail ||
5eef4e87 2564 wq_list_empty(&ctx->iopoll_list))
e9979b36 2565 break;
500f9fba 2566 }
5ba3c874
PB
2567 ret = io_do_iopoll(ctx, !min);
2568 if (ret < 0)
2569 break;
2570 nr_events += ret;
2571 ret = 0;
2572 } while (nr_events < min && !need_resched());
f39c8a5b 2573out:
500f9fba 2574 mutex_unlock(&ctx->uring_lock);
def596e9
JA
2575 return ret;
2576}
2577
491381ce 2578static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2579{
491381ce
JA
2580 /*
2581 * Tell lockdep we inherited freeze protection from submission
2582 * thread.
2583 */
2584 if (req->flags & REQ_F_ISREG) {
1c98679d 2585 struct super_block *sb = file_inode(req->file)->i_sb;
2b188cc1 2586
1c98679d
PB
2587 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2588 sb_end_write(sb);
2b188cc1
JA
2589 }
2590}
2591
b63534c4 2592#ifdef CONFIG_BLOCK
dc2a6e9a 2593static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4 2594{
ab454438 2595 struct io_async_rw *rw = req->async_data;
b63534c4 2596
d886e185 2597 if (!req_has_async_data(req))
ab454438 2598 return !io_req_prep_async(req);
cd658695 2599 iov_iter_restore(&rw->iter, &rw->iter_state);
ab454438 2600 return true;
b63534c4 2601}
b63534c4 2602
3e6a0d3c 2603static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 2604{
355afaeb 2605 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 2606 struct io_ring_ctx *ctx = req->ctx;
b63534c4 2607
355afaeb
JA
2608 if (!S_ISBLK(mode) && !S_ISREG(mode))
2609 return false;
3e6a0d3c
JA
2610 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2611 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 2612 return false;
7c977a58
JA
2613 /*
2614 * If ref is dying, we might be running poll reap from the exit work.
2615 * Don't attempt to reissue from that path, just let it fail with
2616 * -EAGAIN.
2617 */
3e6a0d3c
JA
2618 if (percpu_ref_is_dying(&ctx->refs))
2619 return false;
ef046888
JA
2620 /*
2621 * Play it safe and assume not safe to re-import and reissue if we're
2622 * not in the original thread group (or in task context).
2623 */
2624 if (!same_thread_group(req->task, current) || !in_task())
2625 return false;
3e6a0d3c
JA
2626 return true;
2627}
e82ad485 2628#else
a1ff1e3f 2629static bool io_resubmit_prep(struct io_kiocb *req)
e82ad485
JA
2630{
2631 return false;
2632}
e82ad485 2633static bool io_rw_should_reissue(struct io_kiocb *req)
3e6a0d3c 2634{
b63534c4
JA
2635 return false;
2636}
3e6a0d3c 2637#endif
b63534c4 2638
8ef12efe 2639static bool __io_complete_rw_common(struct io_kiocb *req, long res)
a1d7c393 2640{
b65c128f
PB
2641 if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2642 kiocb_end_write(req);
9532b99b
PB
2643 if (res != req->result) {
2644 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2645 io_rw_should_reissue(req)) {
2646 req->flags |= REQ_F_REISSUE;
8ef12efe 2647 return true;
9532b99b 2648 }
93d2bcd2 2649 req_set_fail(req);
8ef12efe 2650 req->result = res;
9532b99b 2651 }
8ef12efe
JA
2652 return false;
2653}
2654
f237c30a 2655static void io_req_task_complete(struct io_kiocb *req, bool *locked)
8ef12efe 2656{
126180b9 2657 unsigned int cflags = io_put_rw_kbuf(req);
54daa9b2 2658 int res = req->result;
126180b9 2659
fff4e40e 2660 if (*locked) {
126180b9 2661 io_req_complete_state(req, res, cflags);
fff4e40e
PB
2662 io_req_add_compl_list(req);
2663 } else {
126180b9 2664 io_req_complete_post(req, res, cflags);
fff4e40e 2665 }
8ef12efe
JA
2666}
2667
2668static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2669 unsigned int issue_flags)
2670{
2671 if (__io_complete_rw_common(req, res))
2672 return;
63637853 2673 __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
ba816ad6
JA
2674}
2675
2676static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2677{
9adbd45d 2678 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 2679
8ef12efe
JA
2680 if (__io_complete_rw_common(req, res))
2681 return;
2682 req->result = res;
2683 req->io_task_work.func = io_req_task_complete;
2684 io_req_task_work_add(req);
2b188cc1
JA
2685}
2686
def596e9
JA
2687static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2688{
9adbd45d 2689 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 2690
491381ce
JA
2691 if (kiocb->ki_flags & IOCB_WRITE)
2692 kiocb_end_write(req);
9532b99b 2693 if (unlikely(res != req->result)) {
b66ceaf3
PB
2694 if (res == -EAGAIN && io_rw_should_reissue(req)) {
2695 req->flags |= REQ_F_REISSUE;
2696 return;
9532b99b 2697 }
8c130827 2698 }
bbde017a 2699
b3fa03fd
PB
2700 req->result = res;
2701 /* order with io_iopoll_complete() checking ->iopoll_completed */
2702 smp_store_release(&req->iopoll_completed, 1);
def596e9
JA
2703}
2704
2705/*
2706 * After the iocb has been issued, it's safe to be found on the poll list.
2707 * Adding the kiocb to the list AFTER submission ensures that we don't
f39c8a5b 2708 * find it from a io_do_iopoll() thread before the issuer is done
def596e9
JA
2709 * accessing the kiocb cookie.
2710 */
cb3d8972 2711static void io_iopoll_req_issued(struct io_kiocb *req)
def596e9
JA
2712{
2713 struct io_ring_ctx *ctx = req->ctx;
cb3d8972
PB
2714 const bool in_async = io_wq_current_is_worker();
2715
2716 /* workqueue context doesn't hold uring_lock, grab it now */
2717 if (unlikely(in_async))
2718 mutex_lock(&ctx->uring_lock);
def596e9
JA
2719
2720 /*
2721 * Track whether we have multiple files in our lists. This will impact
2722 * how we do polling eventually, not spinning if we're on potentially
2723 * different devices.
2724 */
5eef4e87 2725 if (wq_list_empty(&ctx->iopoll_list)) {
915b3dde
HX
2726 ctx->poll_multi_queue = false;
2727 } else if (!ctx->poll_multi_queue) {
def596e9
JA
2728 struct io_kiocb *list_req;
2729
5eef4e87
PB
2730 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
2731 comp_list);
30da1b45 2732 if (list_req->file != req->file)
915b3dde 2733 ctx->poll_multi_queue = true;
def596e9
JA
2734 }
2735
2736 /*
2737 * For fast devices, IO may have already completed. If it has, add
2738 * it to the front so we find it first.
2739 */
65a6543d 2740 if (READ_ONCE(req->iopoll_completed))
5eef4e87 2741 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
def596e9 2742 else
5eef4e87 2743 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
bdcd3eab 2744
cb3d8972
PB
2745 if (unlikely(in_async)) {
2746 /*
2747 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
2748 * in sq thread task context or in io worker task context. If
2749 * current task context is sq thread, we don't need to check
2750 * whether should wake up sq thread.
2751 */
2752 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2753 wq_has_sleeper(&ctx->sq_data->wait))
2754 wake_up(&ctx->sq_data->wait);
2755
2756 mutex_unlock(&ctx->uring_lock);
2757 }
def596e9
JA
2758}
2759
4503b767
JA
2760static bool io_bdev_nowait(struct block_device *bdev)
2761{
9ba0d0c8 2762 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
2763}
2764
2b188cc1
JA
2765/*
2766 * If we tracked the file through the SCM inflight mechanism, we could support
2767 * any file. For now, just ensure that anything potentially problematic is done
2768 * inline.
2769 */
b191e2df 2770static bool __io_file_supports_nowait(struct file *file, int rw)
2b188cc1
JA
2771{
2772 umode_t mode = file_inode(file)->i_mode;
2773
4503b767 2774 if (S_ISBLK(mode)) {
4e7b5671
CH
2775 if (IS_ENABLED(CONFIG_BLOCK) &&
2776 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
2777 return true;
2778 return false;
2779 }
976517f1 2780 if (S_ISSOCK(mode))
2b188cc1 2781 return true;
4503b767 2782 if (S_ISREG(mode)) {
4e7b5671
CH
2783 if (IS_ENABLED(CONFIG_BLOCK) &&
2784 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
2785 file->f_op != &io_uring_fops)
2786 return true;
2787 return false;
2788 }
2b188cc1 2789
c5b85625
JA
2790 /* any ->read/write should understand O_NONBLOCK */
2791 if (file->f_flags & O_NONBLOCK)
2792 return true;
2793
af197f50
JA
2794 if (!(file->f_mode & FMODE_NOWAIT))
2795 return false;
2796
2797 if (rw == READ)
2798 return file->f_op->read_iter != NULL;
2799
2800 return file->f_op->write_iter != NULL;
2b188cc1
JA
2801}
2802
b191e2df 2803static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
7b29f92d 2804{
b191e2df 2805 if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
7b29f92d 2806 return true;
b191e2df 2807 else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
7b29f92d
JA
2808 return true;
2809
b191e2df 2810 return __io_file_supports_nowait(req->file, rw);
7b29f92d
JA
2811}
2812
5d329e12
JA
2813static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2814 int rw)
2b188cc1 2815{
def596e9 2816 struct io_ring_ctx *ctx = req->ctx;
9adbd45d 2817 struct kiocb *kiocb = &req->rw.kiocb;
75c668cd 2818 struct file *file = req->file;
09bb8394
JA
2819 unsigned ioprio;
2820 int ret;
2b188cc1 2821
c97d8a0f 2822 if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
491381ce
JA
2823 req->flags |= REQ_F_ISREG;
2824
2b188cc1 2825 kiocb->ki_pos = READ_ONCE(sqe->off);
75c668cd 2826 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
ba04291e 2827 req->flags |= REQ_F_CUR_POS;
75c668cd 2828 kiocb->ki_pos = file->f_pos;
ba04291e 2829 }
2b188cc1 2830 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
3e577dcd
PB
2831 kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2832 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2833 if (unlikely(ret))
2834 return ret;
2b188cc1 2835
5d329e12
JA
2836 /*
2837 * If the file is marked O_NONBLOCK, still allow retry for it if it
2838 * supports async. Otherwise it's impossible to use O_NONBLOCK files
2839 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
2840 */
2841 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2842 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
75c668cd
PB
2843 req->flags |= REQ_F_NOWAIT;
2844
2b188cc1
JA
2845 ioprio = READ_ONCE(sqe->ioprio);
2846 if (ioprio) {
2847 ret = ioprio_check_cap(ioprio);
2848 if (ret)
09bb8394 2849 return ret;
2b188cc1
JA
2850
2851 kiocb->ki_ioprio = ioprio;
2852 } else
2853 kiocb->ki_ioprio = get_current_ioprio();
2854
def596e9 2855 if (ctx->flags & IORING_SETUP_IOPOLL) {
def596e9
JA
2856 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2857 !kiocb->ki_filp->f_op->iopoll)
09bb8394 2858 return -EOPNOTSUPP;
2b188cc1 2859
394918eb 2860 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
def596e9 2861 kiocb->ki_complete = io_complete_rw_iopoll;
65a6543d 2862 req->iopoll_completed = 0;
def596e9 2863 } else {
09bb8394
JA
2864 if (kiocb->ki_flags & IOCB_HIPRI)
2865 return -EINVAL;
def596e9
JA
2866 kiocb->ki_complete = io_complete_rw;
2867 }
9adbd45d 2868
eae071c9
PB
2869 if (req->opcode == IORING_OP_READ_FIXED ||
2870 req->opcode == IORING_OP_WRITE_FIXED) {
2871 req->imu = NULL;
a46be971 2872 io_req_set_rsrc_node(req, ctx);
eae071c9
PB
2873 }
2874
3529d8c2
JA
2875 req->rw.addr = READ_ONCE(sqe->addr);
2876 req->rw.len = READ_ONCE(sqe->len);
4f4eeba8 2877 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 2878 return 0;
2b188cc1
JA
2879}
2880
2881static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2882{
2883 switch (ret) {
2884 case -EIOCBQUEUED:
2885 break;
2886 case -ERESTARTSYS:
2887 case -ERESTARTNOINTR:
2888 case -ERESTARTNOHAND:
2889 case -ERESTART_RESTARTBLOCK:
2890 /*
2891 * We can't just restart the syscall, since previously
2892 * submitted sqes may already be in progress. Just fail this
2893 * IO with EINTR.
2894 */
2895 ret = -EINTR;
df561f66 2896 fallthrough;
2b188cc1
JA
2897 default:
2898 kiocb->ki_complete(kiocb, ret, 0);
2899 }
2900}
2901
a1d7c393 2902static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
889fca73 2903 unsigned int issue_flags)
ba816ad6 2904{
ba04291e 2905 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
e8c2bc1f 2906 struct io_async_rw *io = req->async_data;
ba04291e 2907
227c0c96 2908 /* add previously done IO, if any */
d886e185 2909 if (req_has_async_data(req) && io->bytes_done > 0) {
227c0c96 2910 if (ret < 0)
e8c2bc1f 2911 ret = io->bytes_done;
227c0c96 2912 else
e8c2bc1f 2913 ret += io->bytes_done;
227c0c96
JA
2914 }
2915
ba04291e
JA
2916 if (req->flags & REQ_F_CUR_POS)
2917 req->file->f_pos = kiocb->ki_pos;
b66ceaf3 2918 if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
889fca73 2919 __io_complete_rw(req, ret, 0, issue_flags);
ba816ad6
JA
2920 else
2921 io_rw_done(kiocb, ret);
97284637 2922
b66ceaf3 2923 if (req->flags & REQ_F_REISSUE) {
97284637 2924 req->flags &= ~REQ_F_REISSUE;
a7be7c23 2925 if (io_resubmit_prep(req)) {
773af691 2926 io_req_task_queue_reissue(req);
8c130827 2927 } else {
b66ceaf3
PB
2928 unsigned int cflags = io_put_rw_kbuf(req);
2929 struct io_ring_ctx *ctx = req->ctx;
2930
93d2bcd2 2931 req_set_fail(req);
14cfbb7a 2932 if (!(issue_flags & IO_URING_F_NONBLOCK)) {
b66ceaf3
PB
2933 mutex_lock(&ctx->uring_lock);
2934 __io_req_complete(req, issue_flags, ret, cflags);
2935 mutex_unlock(&ctx->uring_lock);
2936 } else {
2937 __io_req_complete(req, issue_flags, ret, cflags);
2938 }
97284637
PB
2939 }
2940 }
ba816ad6
JA
2941}
2942
eae071c9
PB
2943static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
2944 struct io_mapped_ubuf *imu)
edafccee 2945{
9adbd45d 2946 size_t len = req->rw.len;
75769e3f 2947 u64 buf_end, buf_addr = req->rw.addr;
edafccee 2948 size_t offset;
edafccee 2949
75769e3f 2950 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
edafccee
JA
2951 return -EFAULT;
2952 /* not inside the mapped region */
4751f53d 2953 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
edafccee
JA
2954 return -EFAULT;
2955
2956 /*
2957 * May not be a start of buffer, set size appropriately
2958 * and advance us to the beginning.
2959 */
2960 offset = buf_addr - imu->ubuf;
2961 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
2962
2963 if (offset) {
2964 /*
2965 * Don't use iov_iter_advance() here, as it's really slow for
2966 * using the latter parts of a big fixed buffer - it iterates
2967 * over each segment manually. We can cheat a bit here, because
2968 * we know that:
2969 *
2970 * 1) it's a BVEC iter, we set it up
2971 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2972 * first and last bvec
2973 *
2974 * So just find our index, and adjust the iterator afterwards.
2975 * If the offset is within the first bvec (or the whole first
2976 * bvec, just use iov_iter_advance(). This makes it easier
2977 * since we can just skip the first segment, which may not
2978 * be PAGE_SIZE aligned.
2979 */
2980 const struct bio_vec *bvec = imu->bvec;
2981
2982 if (offset <= bvec->bv_len) {
2983 iov_iter_advance(iter, offset);
2984 } else {
2985 unsigned long seg_skip;
2986
2987 /* skip first vec */
2988 offset -= bvec->bv_len;
2989 seg_skip = 1 + (offset >> PAGE_SHIFT);
2990
2991 iter->bvec = bvec + seg_skip;
2992 iter->nr_segs -= seg_skip;
99c79f66 2993 iter->count -= bvec->bv_len + offset;
bd11b3a3 2994 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
2995 }
2996 }
2997
847595de 2998 return 0;
edafccee
JA
2999}
3000
eae071c9
PB
3001static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
3002{
3003 struct io_ring_ctx *ctx = req->ctx;
3004 struct io_mapped_ubuf *imu = req->imu;
3005 u16 index, buf_index = req->buf_index;
3006
3007 if (likely(!imu)) {
3008 if (unlikely(buf_index >= ctx->nr_user_bufs))
3009 return -EFAULT;
3010 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3011 imu = READ_ONCE(ctx->user_bufs[index]);
3012 req->imu = imu;
3013 }
3014 return __io_import_fixed(req, rw, iter, imu);
3015}
3016
bcda7baa
JA
3017static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3018{
3019 if (needs_lock)
3020 mutex_unlock(&ctx->uring_lock);
3021}
3022
3023static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3024{
3025 /*
3026 * "Normal" inline submissions always hold the uring_lock, since we
3027 * grab it from the system call. Same is true for the SQPOLL offload.
3028 * The only exception is when we've detached the request and issue it
3029 * from an async worker thread, grab the lock for that case.
3030 */
3031 if (needs_lock)
3032 mutex_lock(&ctx->uring_lock);
3033}
3034
3035static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
30d51dd4 3036 int bgid, bool needs_lock)
bcda7baa 3037{
30d51dd4 3038 struct io_buffer *kbuf = req->kbuf;
bcda7baa
JA
3039 struct io_buffer *head;
3040
3041 if (req->flags & REQ_F_BUFFER_SELECTED)
3042 return kbuf;
3043
3044 io_ring_submit_lock(req->ctx, needs_lock);
3045
3046 lockdep_assert_held(&req->ctx->uring_lock);
3047
9e15c3a0 3048 head = xa_load(&req->ctx->io_buffers, bgid);
bcda7baa
JA
3049 if (head) {
3050 if (!list_empty(&head->list)) {
3051 kbuf = list_last_entry(&head->list, struct io_buffer,
3052 list);
3053 list_del(&kbuf->list);
3054 } else {
3055 kbuf = head;
9e15c3a0 3056 xa_erase(&req->ctx->io_buffers, bgid);
bcda7baa
JA
3057 }
3058 if (*len > kbuf->len)
3059 *len = kbuf->len;
30d51dd4
PB
3060 req->flags |= REQ_F_BUFFER_SELECTED;
3061 req->kbuf = kbuf;
bcda7baa
JA
3062 } else {
3063 kbuf = ERR_PTR(-ENOBUFS);
3064 }
3065
3066 io_ring_submit_unlock(req->ctx, needs_lock);
bcda7baa
JA
3067 return kbuf;
3068}
3069
4d954c25
JA
3070static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3071 bool needs_lock)
3072{
3073 struct io_buffer *kbuf;
4f4eeba8 3074 u16 bgid;
4d954c25 3075
4f4eeba8 3076 bgid = req->buf_index;
30d51dd4 3077 kbuf = io_buffer_select(req, len, bgid, needs_lock);
4d954c25
JA
3078 if (IS_ERR(kbuf))
3079 return kbuf;
4d954c25
JA
3080 return u64_to_user_ptr(kbuf->addr);
3081}
3082
3083#ifdef CONFIG_COMPAT
3084static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3085 bool needs_lock)
3086{
3087 struct compat_iovec __user *uiov;
3088 compat_ssize_t clen;
3089 void __user *buf;
3090 ssize_t len;
3091
3092 uiov = u64_to_user_ptr(req->rw.addr);
3093 if (!access_ok(uiov, sizeof(*uiov)))
3094 return -EFAULT;
3095 if (__get_user(clen, &uiov->iov_len))
3096 return -EFAULT;
3097 if (clen < 0)
3098 return -EINVAL;
3099
3100 len = clen;
3101 buf = io_rw_buffer_select(req, &len, needs_lock);
3102 if (IS_ERR(buf))
3103 return PTR_ERR(buf);
3104 iov[0].iov_base = buf;
3105 iov[0].iov_len = (compat_size_t) len;
3106 return 0;
3107}
3108#endif
3109
3110static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3111 bool needs_lock)
3112{
3113 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3114 void __user *buf;
3115 ssize_t len;
3116
3117 if (copy_from_user(iov, uiov, sizeof(*uiov)))
3118 return -EFAULT;
3119
3120 len = iov[0].iov_len;
3121 if (len < 0)
3122 return -EINVAL;
3123 buf = io_rw_buffer_select(req, &len, needs_lock);
3124 if (IS_ERR(buf))
3125 return PTR_ERR(buf);
3126 iov[0].iov_base = buf;
3127 iov[0].iov_len = len;
3128 return 0;
3129}
3130
3131static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3132 bool needs_lock)
3133{
dddb3e26 3134 if (req->flags & REQ_F_BUFFER_SELECTED) {
30d51dd4 3135 struct io_buffer *kbuf = req->kbuf;
dddb3e26 3136
dddb3e26
JA
3137 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3138 iov[0].iov_len = kbuf->len;
4d954c25 3139 return 0;
dddb3e26 3140 }
dd201662 3141 if (req->rw.len != 1)
4d954c25
JA
3142 return -EINVAL;
3143
3144#ifdef CONFIG_COMPAT
3145 if (req->ctx->compat)
3146 return io_compat_import(req, iov, needs_lock);
3147#endif
3148
3149 return __io_iov_buffer_select(req, iov, needs_lock);
3150}
3151
847595de
PB
3152static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3153 struct iov_iter *iter, bool needs_lock)
2b188cc1 3154{
9adbd45d
JA
3155 void __user *buf = u64_to_user_ptr(req->rw.addr);
3156 size_t sqe_len = req->rw.len;
847595de 3157 u8 opcode = req->opcode;
4d954c25 3158 ssize_t ret;
edafccee 3159
7d009165 3160 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
edafccee 3161 *iovec = NULL;
9adbd45d 3162 return io_import_fixed(req, rw, iter);
edafccee 3163 }
2b188cc1 3164
bcda7baa 3165 /* buffer index only valid with fixed read/write, or buffer select */
4f4eeba8 3166 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
9adbd45d
JA
3167 return -EINVAL;
3168
3a6820f2 3169 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 3170 if (req->flags & REQ_F_BUFFER_SELECT) {
4d954c25 3171 buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
867a23ea 3172 if (IS_ERR(buf))
4d954c25 3173 return PTR_ERR(buf);
3f9d6441 3174 req->rw.len = sqe_len;
bcda7baa
JA
3175 }
3176
3a6820f2
JA
3177 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3178 *iovec = NULL;
10fc72e4 3179 return ret;
3a6820f2
JA
3180 }
3181
4d954c25
JA
3182 if (req->flags & REQ_F_BUFFER_SELECT) {
3183 ret = io_iov_buffer_select(req, *iovec, needs_lock);
847595de
PB
3184 if (!ret)
3185 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
4d954c25
JA
3186 *iovec = NULL;
3187 return ret;
3188 }
3189
89cd35c5
CH
3190 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3191 req->ctx->compat);
2b188cc1
JA
3192}
3193
0fef9483
JA
3194static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3195{
5b09e37e 3196 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3197}
3198
31b51510 3199/*
32960613
JA
3200 * For files that don't have ->read_iter() and ->write_iter(), handle them
3201 * by looping over ->read() or ->write() manually.
31b51510 3202 */
4017eb91 3203static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3204{
4017eb91
JA
3205 struct kiocb *kiocb = &req->rw.kiocb;
3206 struct file *file = req->file;
32960613
JA
3207 ssize_t ret = 0;
3208
3209 /*
3210 * Don't support polled IO through this interface, and we can't
3211 * support non-blocking either. For the latter, this just causes
3212 * the kiocb to be handled from an async context.
3213 */
3214 if (kiocb->ki_flags & IOCB_HIPRI)
3215 return -EOPNOTSUPP;
3216 if (kiocb->ki_flags & IOCB_NOWAIT)
3217 return -EAGAIN;
3218
3219 while (iov_iter_count(iter)) {
311ae9e1 3220 struct iovec iovec;
32960613
JA
3221 ssize_t nr;
3222
311ae9e1
PB
3223 if (!iov_iter_is_bvec(iter)) {
3224 iovec = iov_iter_iovec(iter);
3225 } else {
4017eb91
JA
3226 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3227 iovec.iov_len = req->rw.len;
311ae9e1
PB
3228 }
3229
32960613
JA
3230 if (rw == READ) {
3231 nr = file->f_op->read(file, iovec.iov_base,
0fef9483 3232 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3233 } else {
3234 nr = file->f_op->write(file, iovec.iov_base,
0fef9483 3235 iovec.iov_len, io_kiocb_ppos(kiocb));
32960613
JA
3236 }
3237
3238 if (nr < 0) {
3239 if (!ret)
3240 ret = nr;
3241 break;
3242 }
16c8d2df
JA
3243 if (!iov_iter_is_bvec(iter)) {
3244 iov_iter_advance(iter, nr);
3245 } else {
3246 req->rw.len -= nr;
3247 req->rw.addr += nr;
3248 }
32960613
JA
3249 ret += nr;
3250 if (nr != iovec.iov_len)
3251 break;
32960613
JA
3252 }
3253
3254 return ret;
3255}
3256
ff6165b2
JA
3257static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3258 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3259{
e8c2bc1f 3260 struct io_async_rw *rw = req->async_data;
b64e3444 3261
ff6165b2 3262 memcpy(&rw->iter, iter, sizeof(*iter));
afb87658 3263 rw->free_iovec = iovec;
227c0c96 3264 rw->bytes_done = 0;
ff6165b2 3265 /* can only be fixed buffers, no need to do anything */
9c3a205c 3266 if (iov_iter_is_bvec(iter))
ff6165b2 3267 return;
b64e3444 3268 if (!iovec) {
ff6165b2
JA
3269 unsigned iov_off = 0;
3270
3271 rw->iter.iov = rw->fast_iov;
3272 if (iter->iov != fast_iov) {
3273 iov_off = iter->iov - fast_iov;
3274 rw->iter.iov += iov_off;
3275 }
3276 if (rw->fast_iov != fast_iov)
3277 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
45097dae 3278 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3279 } else {
3280 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3281 }
3282}
3283
8d4af685 3284static inline bool io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3285{
e8c2bc1f
JA
3286 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3287 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
d886e185
PB
3288 if (req->async_data) {
3289 req->flags |= REQ_F_ASYNC_DATA;
3290 return false;
3291 }
3292 return true;
3d9932a8
XW
3293}
3294
ff6165b2
JA
3295static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3296 const struct iovec *fast_iov,
227c0c96 3297 struct iov_iter *iter, bool force)
b7bb4f7d 3298{
26f0505a 3299 if (!force && !io_op_defs[req->opcode].needs_async_setup)
74566df3 3300 return 0;
d886e185 3301 if (!req_has_async_data(req)) {
cd658695
JA
3302 struct io_async_rw *iorw;
3303
6cb78689 3304 if (io_alloc_async_data(req)) {
6bf985dc 3305 kfree(iovec);
5d204bcf 3306 return -ENOMEM;
6bf985dc 3307 }
b7bb4f7d 3308
ff6165b2 3309 io_req_map_rw(req, iovec, fast_iov, iter);
cd658695
JA
3310 iorw = req->async_data;
3311 /* we've copied and mapped the iter, ensure state is saved */
3312 iov_iter_save_state(&iorw->iter, &iorw->iter_state);
5d204bcf 3313 }
b7bb4f7d 3314 return 0;
f67676d1
JA
3315}
3316
73debe68 3317static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3318{
e8c2bc1f 3319 struct io_async_rw *iorw = req->async_data;
f4bff104 3320 struct iovec *iov = iorw->fast_iov;
847595de 3321 int ret;
c3e330a4 3322
2846c481 3323 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
c3e330a4
PB
3324 if (unlikely(ret < 0))
3325 return ret;
3326
ab0b196c
PB
3327 iorw->bytes_done = 0;
3328 iorw->free_iovec = iov;
3329 if (iov)
3330 req->flags |= REQ_F_NEED_CLEANUP;
cd658695 3331 iov_iter_save_state(&iorw->iter, &iorw->iter_state);
c3e330a4
PB
3332 return 0;
3333}
3334
73debe68 3335static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3336{
3529d8c2
JA
3337 if (unlikely(!(req->file->f_mode & FMODE_READ)))
3338 return -EBADF;
5d329e12 3339 return io_prep_rw(req, sqe, READ);
f67676d1
JA
3340}
3341
c1dd91d1
JA
3342/*
3343 * This is our waitqueue callback handler, registered through lock_page_async()
3344 * when we initially tried to do the IO with the iocb armed our waitqueue.
3345 * This gets called when the page is unlocked, and we generally expect that to
3346 * happen when the page IO is completed and the page is now uptodate. This will
3347 * queue a task_work based retry of the operation, attempting to copy the data
3348 * again. If the latter fails because the page was NOT uptodate, then we will
3349 * do a thread based blocking retry of the operation. That's the unexpected
3350 * slow path.
3351 */
bcf5a063
JA
3352static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3353 int sync, void *arg)
3354{
3355 struct wait_page_queue *wpq;
3356 struct io_kiocb *req = wait->private;
bcf5a063 3357 struct wait_page_key *key = arg;
bcf5a063
JA
3358
3359 wpq = container_of(wait, struct wait_page_queue, wait);
3360
cdc8fcb4
LT
3361 if (!wake_page_match(wpq, key))
3362 return 0;
3363
c8d317aa 3364 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063 3365 list_del_init(&wait->entry);
921b9054 3366 io_req_task_queue(req);
bcf5a063
JA
3367 return 1;
3368}
3369
c1dd91d1
JA
3370/*
3371 * This controls whether a given IO request should be armed for async page
3372 * based retry. If we return false here, the request is handed to the async
3373 * worker threads for retry. If we're doing buffered reads on a regular file,
3374 * we prepare a private wait_page_queue entry and retry the operation. This
3375 * will either succeed because the page is now uptodate and unlocked, or it
3376 * will register a callback when the page is unlocked at IO completion. Through
3377 * that callback, io_uring uses task_work to setup a retry of the operation.
3378 * That retry will attempt the buffered read again. The retry will generally
3379 * succeed, or in rare cases where it fails, we then fall back to using the
3380 * async worker threads for a blocking retry.
3381 */
227c0c96 3382static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3383{
e8c2bc1f
JA
3384 struct io_async_rw *rw = req->async_data;
3385 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3386 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3387
bcf5a063
JA
3388 /* never retry for NOWAIT, we just complete with -EAGAIN */
3389 if (req->flags & REQ_F_NOWAIT)
3390 return false;
f67676d1 3391
227c0c96 3392 /* Only for buffered IO */
3b2a4439 3393 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3394 return false;
3b2a4439 3395
bcf5a063
JA
3396 /*
3397 * just use poll if we can, and don't attempt if the fs doesn't
3398 * support callback based unlocks
3399 */
3400 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3401 return false;
f67676d1 3402
3b2a4439
JA
3403 wait->wait.func = io_async_buf_func;
3404 wait->wait.private = req;
3405 wait->wait.flags = 0;
3406 INIT_LIST_HEAD(&wait->wait.entry);
3407 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3408 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3409 kiocb->ki_waitq = wait;
3b2a4439 3410 return true;
bcf5a063
JA
3411}
3412
aeab9506 3413static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
bcf5a063
JA
3414{
3415 if (req->file->f_op->read_iter)
3416 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3417 else if (req->file->f_op->read)
4017eb91 3418 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3419 else
3420 return -EINVAL;
f67676d1
JA
3421}
3422
7db30437
ML
3423static bool need_read_all(struct io_kiocb *req)
3424{
3425 return req->flags & REQ_F_ISREG ||
3426 S_ISBLK(file_inode(req->file)->i_mode);
3427}
3428
889fca73 3429static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3430{
3431 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3432 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3433 struct iov_iter __iter, *iter = &__iter;
45d189c6 3434 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 3435 struct iov_iter_state __state, *state;
d886e185 3436 struct io_async_rw *rw;
cd658695 3437 ssize_t ret, ret2;
ff6165b2 3438
d886e185
PB
3439 if (req_has_async_data(req)) {
3440 rw = req->async_data;
e8c2bc1f 3441 iter = &rw->iter;
cd658695
JA
3442 state = &rw->iter_state;
3443 /*
3444 * We come here from an earlier attempt, restore our state to
3445 * match in case it doesn't. It's cheap enough that we don't
3446 * need to make this conditional.
3447 */
3448 iov_iter_restore(iter, state);
2846c481
PB
3449 iovec = NULL;
3450 } else {
3451 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3452 if (ret < 0)
3453 return ret;
cd658695
JA
3454 state = &__state;
3455 iov_iter_save_state(iter, state);
2846c481 3456 }
cd658695 3457 req->result = iov_iter_count(iter);
2b188cc1 3458
fd6c2e4c
JA
3459 /* Ensure we clear previously set non-block flag */
3460 if (!force_nonblock)
29de5f6a 3461 kiocb->ki_flags &= ~IOCB_NOWAIT;
a88fc400
PB
3462 else
3463 kiocb->ki_flags |= IOCB_NOWAIT;
3464
24c74678 3465 /* If the file doesn't support async, just async punt */
b191e2df 3466 if (force_nonblock && !io_file_supports_nowait(req, READ)) {
6713e7a6 3467 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc 3468 return ret ?: -EAGAIN;
6713e7a6 3469 }
9e645e11 3470
cd658695 3471 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
5ea5dd45
PB
3472 if (unlikely(ret)) {
3473 kfree(iovec);
3474 return ret;
3475 }
2b188cc1 3476
227c0c96 3477 ret = io_iter_do_read(req, iter);
32960613 3478
230d50d4 3479 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
6ad7f233 3480 req->flags &= ~REQ_F_REISSUE;
eefdf30f
JA
3481 /* IOPOLL retry should happen for io-wq threads */
3482 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3483 goto done;
75c668cd
PB
3484 /* no retry on NONBLOCK nor RWF_NOWAIT */
3485 if (req->flags & REQ_F_NOWAIT)
355afaeb 3486 goto done;
f38c7e3a 3487 ret = 0;
230d50d4
JA
3488 } else if (ret == -EIOCBQUEUED) {
3489 goto out_free;
cd658695 3490 } else if (ret <= 0 || ret == req->result || !force_nonblock ||
7db30437 3491 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
7335e3bf 3492 /* read all, failed, already did sync or don't want to retry */
00d23d51 3493 goto done;
227c0c96
JA
3494 }
3495
cd658695
JA
3496 /*
3497 * Don't depend on the iter state matching what was consumed, or being
3498 * untouched in case of error. Restore it and we'll advance it
3499 * manually if we need to.
3500 */
3501 iov_iter_restore(iter, state);
3502
227c0c96 3503 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
6bf985dc
PB
3504 if (ret2)
3505 return ret2;
3506
fe1cdd55 3507 iovec = NULL;
e8c2bc1f 3508 rw = req->async_data;
cd658695
JA
3509 /*
3510 * Now use our persistent iterator and state, if we aren't already.
3511 * We've restored and mapped the iter to match.
3512 */
3513 if (iter != &rw->iter) {
3514 iter = &rw->iter;
3515 state = &rw->iter_state;
3516 }
227c0c96 3517
b23df91b 3518 do {
cd658695
JA
3519 /*
3520 * We end up here because of a partial read, either from
3521 * above or inside this loop. Advance the iter by the bytes
3522 * that were consumed.
3523 */
3524 iov_iter_advance(iter, ret);
3525 if (!iov_iter_count(iter))
3526 break;
b23df91b 3527 rw->bytes_done += ret;
cd658695
JA
3528 iov_iter_save_state(iter, state);
3529
b23df91b
PB
3530 /* if we can retry, do so with the callbacks armed */
3531 if (!io_rw_should_retry(req)) {
3532 kiocb->ki_flags &= ~IOCB_WAITQ;
3533 return -EAGAIN;
3534 }
3535
3536 /*
3537 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3538 * we get -EIOCBQUEUED, then we'll get a notification when the
3539 * desired page gets unlocked. We can also get a partial read
3540 * here, and if we do, then just retry at the new offset.
3541 */
3542 ret = io_iter_do_read(req, iter);
3543 if (ret == -EIOCBQUEUED)
3544 return 0;
227c0c96 3545 /* we got some bytes, but not all. retry. */
b5b0ecb7 3546 kiocb->ki_flags &= ~IOCB_WAITQ;
cd658695
JA
3547 iov_iter_restore(iter, state);
3548 } while (ret > 0);
227c0c96 3549done:
889fca73 3550 kiocb_done(kiocb, ret, issue_flags);
fe1cdd55
PB
3551out_free:
3552 /* it's faster to check here then delegate to kfree */
3553 if (iovec)
3554 kfree(iovec);
5ea5dd45 3555 return 0;
2b188cc1
JA
3556}
3557
73debe68 3558static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 3559{
3529d8c2
JA
3560 if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3561 return -EBADF;
5d329e12 3562 return io_prep_rw(req, sqe, WRITE);
f67676d1
JA
3563}
3564
889fca73 3565static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
3566{
3567 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
9adbd45d 3568 struct kiocb *kiocb = &req->rw.kiocb;
ff6165b2 3569 struct iov_iter __iter, *iter = &__iter;
45d189c6 3570 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 3571 struct iov_iter_state __state, *state;
d886e185 3572 struct io_async_rw *rw;
cd658695 3573 ssize_t ret, ret2;
2b188cc1 3574
d886e185
PB
3575 if (req_has_async_data(req)) {
3576 rw = req->async_data;
e8c2bc1f 3577 iter = &rw->iter;
cd658695
JA
3578 state = &rw->iter_state;
3579 iov_iter_restore(iter, state);
2846c481
PB
3580 iovec = NULL;
3581 } else {
3582 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3583 if (ret < 0)
3584 return ret;
cd658695
JA
3585 state = &__state;
3586 iov_iter_save_state(iter, state);
2846c481 3587 }
cd658695 3588 req->result = iov_iter_count(iter);
2b188cc1 3589
fd6c2e4c
JA
3590 /* Ensure we clear previously set non-block flag */
3591 if (!force_nonblock)
a88fc400
PB
3592 kiocb->ki_flags &= ~IOCB_NOWAIT;
3593 else
3594 kiocb->ki_flags |= IOCB_NOWAIT;
fd6c2e4c 3595
24c74678 3596 /* If the file doesn't support async, just async punt */
b191e2df 3597 if (force_nonblock && !io_file_supports_nowait(req, WRITE))
f67676d1 3598 goto copy_iov;
31b51510 3599
10d59345
JA
3600 /* file path doesn't support NOWAIT for non-direct_IO */
3601 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3602 (req->flags & REQ_F_ISREG))
f67676d1 3603 goto copy_iov;
31b51510 3604
cd658695 3605 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
fa15bafb
PB
3606 if (unlikely(ret))
3607 goto out_free;
4ed734b0 3608
fa15bafb
PB
3609 /*
3610 * Open-code file_start_write here to grab freeze protection,
3611 * which will be released by another thread in
3612 * io_complete_rw(). Fool lockdep by telling it the lock got
3613 * released so that it doesn't complain about the held lock when
3614 * we return to userspace.
3615 */
3616 if (req->flags & REQ_F_ISREG) {
8a3c84b6 3617 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
3618 __sb_writers_release(file_inode(req->file)->i_sb,
3619 SB_FREEZE_WRITE);
3620 }
3621 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 3622
fa15bafb 3623 if (req->file->f_op->write_iter)
ff6165b2 3624 ret2 = call_write_iter(req->file, kiocb, iter);
2dd2111d 3625 else if (req->file->f_op->write)
4017eb91 3626 ret2 = loop_rw_iter(WRITE, req, iter);
2dd2111d
GH
3627 else
3628 ret2 = -EINVAL;
4ed734b0 3629
6ad7f233
PB
3630 if (req->flags & REQ_F_REISSUE) {
3631 req->flags &= ~REQ_F_REISSUE;
230d50d4 3632 ret2 = -EAGAIN;
6ad7f233 3633 }
230d50d4 3634
fa15bafb
PB
3635 /*
3636 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3637 * retry them without IOCB_NOWAIT.
3638 */
3639 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3640 ret2 = -EAGAIN;
75c668cd
PB
3641 /* no retry on NONBLOCK nor RWF_NOWAIT */
3642 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 3643 goto done;
fa15bafb 3644 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f
JA
3645 /* IOPOLL retry should happen for io-wq threads */
3646 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3647 goto copy_iov;
355afaeb 3648done:
889fca73 3649 kiocb_done(kiocb, ret2, issue_flags);
fa15bafb 3650 } else {
f67676d1 3651copy_iov:
cd658695 3652 iov_iter_restore(iter, state);
227c0c96 3653 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
6bf985dc 3654 return ret ?: -EAGAIN;
2b188cc1 3655 }
31b51510 3656out_free:
f261c168 3657 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 3658 if (iovec)
6f2cc166 3659 kfree(iovec);
2b188cc1
JA
3660 return ret;
3661}
3662
80a261fd
JA
3663static int io_renameat_prep(struct io_kiocb *req,
3664 const struct io_uring_sqe *sqe)
3665{
3666 struct io_rename *ren = &req->rename;
3667 const char __user *oldf, *newf;
3668
ed7eb259
JA
3669 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3670 return -EINVAL;
26578cda 3671 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
ed7eb259 3672 return -EINVAL;
80a261fd
JA
3673 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3674 return -EBADF;
3675
3676 ren->old_dfd = READ_ONCE(sqe->fd);
3677 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3678 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3679 ren->new_dfd = READ_ONCE(sqe->len);
3680 ren->flags = READ_ONCE(sqe->rename_flags);
3681
3682 ren->oldpath = getname(oldf);
3683 if (IS_ERR(ren->oldpath))
3684 return PTR_ERR(ren->oldpath);
3685
3686 ren->newpath = getname(newf);
3687 if (IS_ERR(ren->newpath)) {
3688 putname(ren->oldpath);
3689 return PTR_ERR(ren->newpath);
3690 }
3691
3692 req->flags |= REQ_F_NEED_CLEANUP;
3693 return 0;
3694}
3695
45d189c6 3696static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
3697{
3698 struct io_rename *ren = &req->rename;
3699 int ret;
3700
45d189c6 3701 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
3702 return -EAGAIN;
3703
3704 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3705 ren->newpath, ren->flags);
3706
3707 req->flags &= ~REQ_F_NEED_CLEANUP;
3708 if (ret < 0)
93d2bcd2 3709 req_set_fail(req);
80a261fd
JA
3710 io_req_complete(req, ret);
3711 return 0;
3712}
3713
14a1143b
JA
3714static int io_unlinkat_prep(struct io_kiocb *req,
3715 const struct io_uring_sqe *sqe)
3716{
3717 struct io_unlink *un = &req->unlink;
3718 const char __user *fname;
3719
22634bc5
JA
3720 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3721 return -EINVAL;
26578cda
PB
3722 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3723 sqe->splice_fd_in)
22634bc5 3724 return -EINVAL;
14a1143b
JA
3725 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3726 return -EBADF;
3727
3728 un->dfd = READ_ONCE(sqe->fd);
3729
3730 un->flags = READ_ONCE(sqe->unlink_flags);
3731 if (un->flags & ~AT_REMOVEDIR)
3732 return -EINVAL;
3733
3734 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3735 un->filename = getname(fname);
3736 if (IS_ERR(un->filename))
3737 return PTR_ERR(un->filename);
3738
3739 req->flags |= REQ_F_NEED_CLEANUP;
3740 return 0;
3741}
3742
45d189c6 3743static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
3744{
3745 struct io_unlink *un = &req->unlink;
3746 int ret;
3747
45d189c6 3748 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
3749 return -EAGAIN;
3750
3751 if (un->flags & AT_REMOVEDIR)
3752 ret = do_rmdir(un->dfd, un->filename);
3753 else
3754 ret = do_unlinkat(un->dfd, un->filename);
3755
3756 req->flags &= ~REQ_F_NEED_CLEANUP;
3757 if (ret < 0)
93d2bcd2 3758 req_set_fail(req);
14a1143b
JA
3759 io_req_complete(req, ret);
3760 return 0;
3761}
3762
e34a02dc
DK
3763static int io_mkdirat_prep(struct io_kiocb *req,
3764 const struct io_uring_sqe *sqe)
3765{
3766 struct io_mkdir *mkd = &req->mkdir;
3767 const char __user *fname;
3768
3769 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3770 return -EINVAL;
3771 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
3772 sqe->splice_fd_in)
3773 return -EINVAL;
3774 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3775 return -EBADF;
3776
3777 mkd->dfd = READ_ONCE(sqe->fd);
3778 mkd->mode = READ_ONCE(sqe->len);
3779
3780 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3781 mkd->filename = getname(fname);
3782 if (IS_ERR(mkd->filename))
3783 return PTR_ERR(mkd->filename);
3784
3785 req->flags |= REQ_F_NEED_CLEANUP;
3786 return 0;
3787}
3788
04f34081 3789static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
e34a02dc
DK
3790{
3791 struct io_mkdir *mkd = &req->mkdir;
3792 int ret;
3793
3794 if (issue_flags & IO_URING_F_NONBLOCK)
3795 return -EAGAIN;
3796
3797 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
3798
3799 req->flags &= ~REQ_F_NEED_CLEANUP;
3800 if (ret < 0)
3801 req_set_fail(req);
3802 io_req_complete(req, ret);
3803 return 0;
3804}
3805
7a8721f8
DK
3806static int io_symlinkat_prep(struct io_kiocb *req,
3807 const struct io_uring_sqe *sqe)
3808{
3809 struct io_symlink *sl = &req->symlink;
3810 const char __user *oldpath, *newpath;
3811
3812 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3813 return -EINVAL;
3814 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
3815 sqe->splice_fd_in)
3816 return -EINVAL;
3817 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3818 return -EBADF;
3819
3820 sl->new_dfd = READ_ONCE(sqe->fd);
3821 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
3822 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3823
3824 sl->oldpath = getname(oldpath);
3825 if (IS_ERR(sl->oldpath))
3826 return PTR_ERR(sl->oldpath);
3827
3828 sl->newpath = getname(newpath);
3829 if (IS_ERR(sl->newpath)) {
3830 putname(sl->oldpath);
3831 return PTR_ERR(sl->newpath);
3832 }
3833
3834 req->flags |= REQ_F_NEED_CLEANUP;
3835 return 0;
3836}
3837
04f34081 3838static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
7a8721f8
DK
3839{
3840 struct io_symlink *sl = &req->symlink;
3841 int ret;
3842
3843 if (issue_flags & IO_URING_F_NONBLOCK)
3844 return -EAGAIN;
3845
3846 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
3847
3848 req->flags &= ~REQ_F_NEED_CLEANUP;
3849 if (ret < 0)
3850 req_set_fail(req);
3851 io_req_complete(req, ret);
3852 return 0;
3853}
3854
cf30da90
DK
3855static int io_linkat_prep(struct io_kiocb *req,
3856 const struct io_uring_sqe *sqe)
3857{
3858 struct io_hardlink *lnk = &req->hardlink;
3859 const char __user *oldf, *newf;
3860
3861 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3862 return -EINVAL;
3863 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
3864 return -EINVAL;
3865 if (unlikely(req->flags & REQ_F_FIXED_FILE))
3866 return -EBADF;
3867
3868 lnk->old_dfd = READ_ONCE(sqe->fd);
3869 lnk->new_dfd = READ_ONCE(sqe->len);
3870 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3871 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3872 lnk->flags = READ_ONCE(sqe->hardlink_flags);
3873
3874 lnk->oldpath = getname(oldf);
3875 if (IS_ERR(lnk->oldpath))
3876 return PTR_ERR(lnk->oldpath);
3877
3878 lnk->newpath = getname(newf);
3879 if (IS_ERR(lnk->newpath)) {
3880 putname(lnk->oldpath);
3881 return PTR_ERR(lnk->newpath);
3882 }
3883
3884 req->flags |= REQ_F_NEED_CLEANUP;
3885 return 0;
3886}
3887
04f34081 3888static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
cf30da90
DK
3889{
3890 struct io_hardlink *lnk = &req->hardlink;
3891 int ret;
3892
3893 if (issue_flags & IO_URING_F_NONBLOCK)
3894 return -EAGAIN;
3895
3896 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
3897 lnk->newpath, lnk->flags);
3898
3899 req->flags &= ~REQ_F_NEED_CLEANUP;
3900 if (ret < 0)
3901 req_set_fail(req);
3902 io_req_complete(req, ret);
3903 return 0;
3904}
3905
36f4fa68
JA
3906static int io_shutdown_prep(struct io_kiocb *req,
3907 const struct io_uring_sqe *sqe)
3908{
3909#if defined(CONFIG_NET)
3910 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3911 return -EINVAL;
26578cda
PB
3912 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3913 sqe->buf_index || sqe->splice_fd_in))
36f4fa68
JA
3914 return -EINVAL;
3915
3916 req->shutdown.how = READ_ONCE(sqe->len);
3917 return 0;
3918#else
3919 return -EOPNOTSUPP;
3920#endif
3921}
3922
45d189c6 3923static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
3924{
3925#if defined(CONFIG_NET)
3926 struct socket *sock;
3927 int ret;
3928
45d189c6 3929 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
3930 return -EAGAIN;
3931
48aba79b 3932 sock = sock_from_file(req->file);
36f4fa68 3933 if (unlikely(!sock))
48aba79b 3934 return -ENOTSOCK;
36f4fa68
JA
3935
3936 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d 3937 if (ret < 0)
93d2bcd2 3938 req_set_fail(req);
36f4fa68
JA
3939 io_req_complete(req, ret);
3940 return 0;
3941#else
3942 return -EOPNOTSUPP;
3943#endif
3944}
3945
f2a8d5c7
PB
3946static int __io_splice_prep(struct io_kiocb *req,
3947 const struct io_uring_sqe *sqe)
7d67af2c 3948{
fe7e3257 3949 struct io_splice *sp = &req->splice;
7d67af2c 3950 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 3951
3232dd02
PB
3952 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3953 return -EINVAL;
7d67af2c
PB
3954
3955 sp->file_in = NULL;
7d67af2c
PB
3956 sp->len = READ_ONCE(sqe->len);
3957 sp->flags = READ_ONCE(sqe->splice_flags);
3958
3959 if (unlikely(sp->flags & ~valid_flags))
3960 return -EINVAL;
3961
62906e89 3962 sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
8371adf5
PB
3963 (sp->flags & SPLICE_F_FD_IN_FIXED));
3964 if (!sp->file_in)
3965 return -EBADF;
7d67af2c 3966 req->flags |= REQ_F_NEED_CLEANUP;
7d67af2c
PB
3967 return 0;
3968}
3969
f2a8d5c7
PB
3970static int io_tee_prep(struct io_kiocb *req,
3971 const struct io_uring_sqe *sqe)
3972{
3973 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3974 return -EINVAL;
3975 return __io_splice_prep(req, sqe);
3976}
3977
45d189c6 3978static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
3979{
3980 struct io_splice *sp = &req->splice;
3981 struct file *in = sp->file_in;
3982 struct file *out = sp->file_out;
3983 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3984 long ret = 0;
3985
45d189c6 3986 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7
PB
3987 return -EAGAIN;
3988 if (sp->len)
3989 ret = do_tee(in, out, sp->len, flags);
3990
e1d767f0
PB
3991 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3992 io_put_file(in);
f2a8d5c7
PB
3993 req->flags &= ~REQ_F_NEED_CLEANUP;
3994
f2a8d5c7 3995 if (ret != sp->len)
93d2bcd2 3996 req_set_fail(req);
e1e16097 3997 io_req_complete(req, ret);
f2a8d5c7
PB
3998 return 0;
3999}
4000
4001static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4002{
fe7e3257 4003 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
4004
4005 sp->off_in = READ_ONCE(sqe->splice_off_in);
4006 sp->off_out = READ_ONCE(sqe->off);
4007 return __io_splice_prep(req, sqe);
4008}
4009
45d189c6 4010static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
4011{
4012 struct io_splice *sp = &req->splice;
4013 struct file *in = sp->file_in;
4014 struct file *out = sp->file_out;
4015 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4016 loff_t *poff_in, *poff_out;
c9687426 4017 long ret = 0;
7d67af2c 4018
45d189c6 4019 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 4020 return -EAGAIN;
7d67af2c
PB
4021
4022 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4023 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 4024
948a7749 4025 if (sp->len)
c9687426 4026 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c 4027
e1d767f0
PB
4028 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4029 io_put_file(in);
7d67af2c
PB
4030 req->flags &= ~REQ_F_NEED_CLEANUP;
4031
7d67af2c 4032 if (ret != sp->len)
93d2bcd2 4033 req_set_fail(req);
e1e16097 4034 io_req_complete(req, ret);
7d67af2c
PB
4035 return 0;
4036}
4037
2b188cc1
JA
4038/*
4039 * IORING_OP_NOP just posts a completion event, nothing else.
4040 */
889fca73 4041static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
4042{
4043 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 4044
def596e9
JA
4045 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4046 return -EINVAL;
4047
889fca73 4048 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
4049 return 0;
4050}
4051
1155c76a 4052static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 4053{
6b06314c 4054 struct io_ring_ctx *ctx = req->ctx;
c992fe29 4055
09bb8394
JA
4056 if (!req->file)
4057 return -EBADF;
c992fe29 4058
6b06314c 4059 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 4060 return -EINVAL;
26578cda
PB
4061 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4062 sqe->splice_fd_in))
c992fe29
CH
4063 return -EINVAL;
4064
8ed8d3c3
JA
4065 req->sync.flags = READ_ONCE(sqe->fsync_flags);
4066 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4067 return -EINVAL;
4068
4069 req->sync.off = READ_ONCE(sqe->off);
4070 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
4071 return 0;
4072}
4073
45d189c6 4074static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4075{
8ed8d3c3 4076 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
4077 int ret;
4078
ac45abc0 4079 /* fsync always requires a blocking context */
45d189c6 4080 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4081 return -EAGAIN;
4082
9adbd45d 4083 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
4084 end > 0 ? end : LLONG_MAX,
4085 req->sync.flags & IORING_FSYNC_DATASYNC);
4086 if (ret < 0)
93d2bcd2 4087 req_set_fail(req);
e1e16097 4088 io_req_complete(req, ret);
c992fe29
CH
4089 return 0;
4090}
4091
d63d1b5e
JA
4092static int io_fallocate_prep(struct io_kiocb *req,
4093 const struct io_uring_sqe *sqe)
4094{
26578cda
PB
4095 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4096 sqe->splice_fd_in)
d63d1b5e 4097 return -EINVAL;
3232dd02
PB
4098 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4099 return -EINVAL;
d63d1b5e
JA
4100
4101 req->sync.off = READ_ONCE(sqe->off);
4102 req->sync.len = READ_ONCE(sqe->addr);
4103 req->sync.mode = READ_ONCE(sqe->len);
4104 return 0;
4105}
4106
45d189c6 4107static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 4108{
ac45abc0
PB
4109 int ret;
4110
d63d1b5e 4111 /* fallocate always requiring blocking context */
45d189c6 4112 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 4113 return -EAGAIN;
ac45abc0
PB
4114 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4115 req->sync.len);
ac45abc0 4116 if (ret < 0)
93d2bcd2 4117 req_set_fail(req);
e1e16097 4118 io_req_complete(req, ret);
5d17b4a4
JA
4119 return 0;
4120}
4121
ec65fea5 4122static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 4123{
f8748881 4124 const char __user *fname;
15b71abe 4125 int ret;
b7bb4f7d 4126
d3fddf6d
PB
4127 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4128 return -EINVAL;
b9445598 4129 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 4130 return -EINVAL;
ec65fea5 4131 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 4132 return -EBADF;
03b1230c 4133
ec65fea5
PB
4134 /* open.how should be already initialised */
4135 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 4136 req->open.how.flags |= O_LARGEFILE;
3529d8c2 4137
25e72d10
PB
4138 req->open.dfd = READ_ONCE(sqe->fd);
4139 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 4140 req->open.filename = getname(fname);
15b71abe
JA
4141 if (IS_ERR(req->open.filename)) {
4142 ret = PTR_ERR(req->open.filename);
4143 req->open.filename = NULL;
4144 return ret;
4145 }
b9445598
PB
4146
4147 req->open.file_slot = READ_ONCE(sqe->file_index);
4148 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4149 return -EINVAL;
4150
4022e7af 4151 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 4152 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 4153 return 0;
03b1230c
JA
4154}
4155
ec65fea5
PB
4156static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4157{
d3fddf6d
PB
4158 u64 mode = READ_ONCE(sqe->len);
4159 u64 flags = READ_ONCE(sqe->open_flags);
ec65fea5 4160
ec65fea5
PB
4161 req->open.how = build_open_how(flags, mode);
4162 return __io_openat_prep(req, sqe);
4163}
4164
cebdb986 4165static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 4166{
cebdb986 4167 struct open_how __user *how;
cebdb986 4168 size_t len;
0fa03c62
JA
4169 int ret;
4170
cebdb986
JA
4171 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4172 len = READ_ONCE(sqe->len);
cebdb986
JA
4173 if (len < OPEN_HOW_SIZE_VER0)
4174 return -EINVAL;
3529d8c2 4175
cebdb986
JA
4176 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4177 len);
4178 if (ret)
4179 return ret;
3529d8c2 4180
ec65fea5 4181 return __io_openat_prep(req, sqe);
cebdb986
JA
4182}
4183
45d189c6 4184static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
4185{
4186 struct open_flags op;
15b71abe 4187 struct file *file;
b9445598
PB
4188 bool resolve_nonblock, nonblock_set;
4189 bool fixed = !!req->open.file_slot;
15b71abe
JA
4190 int ret;
4191
cebdb986 4192 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
4193 if (ret)
4194 goto err;
3a81fd02
JA
4195 nonblock_set = op.open_flag & O_NONBLOCK;
4196 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 4197 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
4198 /*
4199 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4200 * it'll always -EAGAIN
4201 */
4202 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4203 return -EAGAIN;
4204 op.lookup_flags |= LOOKUP_CACHED;
4205 op.open_flag |= O_NONBLOCK;
4206 }
15b71abe 4207
b9445598
PB
4208 if (!fixed) {
4209 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4210 if (ret < 0)
4211 goto err;
4212 }
15b71abe
JA
4213
4214 file = do_filp_open(req->open.dfd, req->open.filename, &op);
12dcb58a 4215 if (IS_ERR(file)) {
944d1444 4216 /*
12dcb58a
PB
4217 * We could hang on to this 'fd' on retrying, but seems like
4218 * marginal gain for something that is now known to be a slower
4219 * path. So just put it, and we'll get a new one when we retry.
944d1444 4220 */
b9445598
PB
4221 if (!fixed)
4222 put_unused_fd(ret);
3a81fd02 4223
15b71abe 4224 ret = PTR_ERR(file);
12dcb58a
PB
4225 /* only retry if RESOLVE_CACHED wasn't already set by application */
4226 if (ret == -EAGAIN &&
4227 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4228 return -EAGAIN;
4229 goto err;
15b71abe 4230 }
12dcb58a
PB
4231
4232 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4233 file->f_flags &= ~O_NONBLOCK;
4234 fsnotify_open(file);
b9445598
PB
4235
4236 if (!fixed)
4237 fd_install(ret, file);
4238 else
4239 ret = io_install_fixed_file(req, file, issue_flags,
4240 req->open.file_slot - 1);
15b71abe
JA
4241err:
4242 putname(req->open.filename);
8fef80bf 4243 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe 4244 if (ret < 0)
93d2bcd2 4245 req_set_fail(req);
0bdf3398 4246 __io_req_complete(req, issue_flags, ret, 0);
15b71abe
JA
4247 return 0;
4248}
4249
45d189c6 4250static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 4251{
e45cff58 4252 return io_openat2(req, issue_flags);
cebdb986
JA
4253}
4254
067524e9
JA
4255static int io_remove_buffers_prep(struct io_kiocb *req,
4256 const struct io_uring_sqe *sqe)
4257{
4258 struct io_provide_buf *p = &req->pbuf;
4259 u64 tmp;
4260
26578cda
PB
4261 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4262 sqe->splice_fd_in)
067524e9
JA
4263 return -EINVAL;
4264
4265 tmp = READ_ONCE(sqe->fd);
4266 if (!tmp || tmp > USHRT_MAX)
4267 return -EINVAL;
4268
4269 memset(p, 0, sizeof(*p));
4270 p->nbufs = tmp;
4271 p->bgid = READ_ONCE(sqe->buf_group);
4272 return 0;
4273}
4274
4275static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4276 int bgid, unsigned nbufs)
4277{
4278 unsigned i = 0;
4279
4280 /* shouldn't happen */
4281 if (!nbufs)
4282 return 0;
4283
4284 /* the head kbuf is the list itself */
4285 while (!list_empty(&buf->list)) {
4286 struct io_buffer *nxt;
4287
4288 nxt = list_first_entry(&buf->list, struct io_buffer, list);
4289 list_del(&nxt->list);
4290 kfree(nxt);
4291 if (++i == nbufs)
4292 return i;
4293 }
4294 i++;
4295 kfree(buf);
9e15c3a0 4296 xa_erase(&ctx->io_buffers, bgid);
067524e9
JA
4297
4298 return i;
4299}
4300
889fca73 4301static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
4302{
4303 struct io_provide_buf *p = &req->pbuf;
4304 struct io_ring_ctx *ctx = req->ctx;
4305 struct io_buffer *head;
4306 int ret = 0;
45d189c6 4307 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
067524e9
JA
4308
4309 io_ring_submit_lock(ctx, !force_nonblock);
4310
4311 lockdep_assert_held(&ctx->uring_lock);
4312
4313 ret = -ENOENT;
9e15c3a0 4314 head = xa_load(&ctx->io_buffers, p->bgid);
067524e9
JA
4315 if (head)
4316 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
067524e9 4317 if (ret < 0)
93d2bcd2 4318 req_set_fail(req);
067524e9 4319
9fb8cb49
PB
4320 /* complete before unlock, IOPOLL may need the lock */
4321 __io_req_complete(req, issue_flags, ret, 0);
4322 io_ring_submit_unlock(ctx, !force_nonblock);
067524e9
JA
4323 return 0;
4324}
4325
ddf0322d
JA
4326static int io_provide_buffers_prep(struct io_kiocb *req,
4327 const struct io_uring_sqe *sqe)
4328{
38134ada 4329 unsigned long size, tmp_check;
ddf0322d
JA
4330 struct io_provide_buf *p = &req->pbuf;
4331 u64 tmp;
4332
26578cda 4333 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
ddf0322d
JA
4334 return -EINVAL;
4335
4336 tmp = READ_ONCE(sqe->fd);
4337 if (!tmp || tmp > USHRT_MAX)
4338 return -E2BIG;
4339 p->nbufs = tmp;
4340 p->addr = READ_ONCE(sqe->addr);
4341 p->len = READ_ONCE(sqe->len);
4342
38134ada
PB
4343 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4344 &size))
4345 return -EOVERFLOW;
4346 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4347 return -EOVERFLOW;
4348
d81269fe
PB
4349 size = (unsigned long)p->len * p->nbufs;
4350 if (!access_ok(u64_to_user_ptr(p->addr), size))
ddf0322d
JA
4351 return -EFAULT;
4352
4353 p->bgid = READ_ONCE(sqe->buf_group);
4354 tmp = READ_ONCE(sqe->off);
4355 if (tmp > USHRT_MAX)
4356 return -E2BIG;
4357 p->bid = tmp;
4358 return 0;
4359}
4360
4361static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4362{
4363 struct io_buffer *buf;
4364 u64 addr = pbuf->addr;
4365 int i, bid = pbuf->bid;
4366
4367 for (i = 0; i < pbuf->nbufs; i++) {
9990da93 4368 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
ddf0322d
JA
4369 if (!buf)
4370 break;
4371
4372 buf->addr = addr;
d1f82808 4373 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
ddf0322d
JA
4374 buf->bid = bid;
4375 addr += pbuf->len;
4376 bid++;
4377 if (!*head) {
4378 INIT_LIST_HEAD(&buf->list);
4379 *head = buf;
4380 } else {
4381 list_add_tail(&buf->list, &(*head)->list);
4382 }
4383 }
4384
4385 return i ? i : -ENOMEM;
4386}
4387
889fca73 4388static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
4389{
4390 struct io_provide_buf *p = &req->pbuf;
4391 struct io_ring_ctx *ctx = req->ctx;
4392 struct io_buffer *head, *list;
4393 int ret = 0;
45d189c6 4394 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ddf0322d
JA
4395
4396 io_ring_submit_lock(ctx, !force_nonblock);
4397
4398 lockdep_assert_held(&ctx->uring_lock);
4399
9e15c3a0 4400 list = head = xa_load(&ctx->io_buffers, p->bgid);
ddf0322d
JA
4401
4402 ret = io_add_buffers(p, &head);
9e15c3a0
JA
4403 if (ret >= 0 && !list) {
4404 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4405 if (ret < 0)
067524e9 4406 __io_remove_buffers(ctx, head, p->bgid, -1U);
ddf0322d 4407 }
ddf0322d 4408 if (ret < 0)
93d2bcd2 4409 req_set_fail(req);
9fb8cb49
PB
4410 /* complete before unlock, IOPOLL may need the lock */
4411 __io_req_complete(req, issue_flags, ret, 0);
4412 io_ring_submit_unlock(ctx, !force_nonblock);
ddf0322d 4413 return 0;
cebdb986
JA
4414}
4415
3e4827b0
JA
4416static int io_epoll_ctl_prep(struct io_kiocb *req,
4417 const struct io_uring_sqe *sqe)
4418{
4419#if defined(CONFIG_EPOLL)
26578cda 4420 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3e4827b0 4421 return -EINVAL;
2d74d042 4422 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4423 return -EINVAL;
3e4827b0
JA
4424
4425 req->epoll.epfd = READ_ONCE(sqe->fd);
4426 req->epoll.op = READ_ONCE(sqe->len);
4427 req->epoll.fd = READ_ONCE(sqe->off);
4428
4429 if (ep_op_has_event(req->epoll.op)) {
4430 struct epoll_event __user *ev;
4431
4432 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4433 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4434 return -EFAULT;
4435 }
4436
4437 return 0;
4438#else
4439 return -EOPNOTSUPP;
4440#endif
4441}
4442
889fca73 4443static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4444{
4445#if defined(CONFIG_EPOLL)
4446 struct io_epoll *ie = &req->epoll;
4447 int ret;
45d189c6 4448 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4449
4450 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4451 if (force_nonblock && ret == -EAGAIN)
4452 return -EAGAIN;
4453
4454 if (ret < 0)
93d2bcd2 4455 req_set_fail(req);
889fca73 4456 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4457 return 0;
4458#else
4459 return -EOPNOTSUPP;
4460#endif
4461}
4462
c1ca757b
JA
4463static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4464{
4465#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
26578cda 4466 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
c1ca757b 4467 return -EINVAL;
3232dd02
PB
4468 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4469 return -EINVAL;
c1ca757b
JA
4470
4471 req->madvise.addr = READ_ONCE(sqe->addr);
4472 req->madvise.len = READ_ONCE(sqe->len);
4473 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4474 return 0;
4475#else
4476 return -EOPNOTSUPP;
4477#endif
4478}
4479
45d189c6 4480static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4481{
4482#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4483 struct io_madvise *ma = &req->madvise;
4484 int ret;
4485
45d189c6 4486 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4487 return -EAGAIN;
4488
0726b01e 4489 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b 4490 if (ret < 0)
93d2bcd2 4491 req_set_fail(req);
e1e16097 4492 io_req_complete(req, ret);
c1ca757b
JA
4493 return 0;
4494#else
4495 return -EOPNOTSUPP;
4496#endif
4497}
4498
4840e418
JA
4499static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4500{
26578cda 4501 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4840e418 4502 return -EINVAL;
3232dd02
PB
4503 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4504 return -EINVAL;
4840e418
JA
4505
4506 req->fadvise.offset = READ_ONCE(sqe->off);
4507 req->fadvise.len = READ_ONCE(sqe->len);
4508 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4509 return 0;
4510}
4511
45d189c6 4512static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
4513{
4514 struct io_fadvise *fa = &req->fadvise;
4515 int ret;
4516
45d189c6 4517 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
4518 switch (fa->advice) {
4519 case POSIX_FADV_NORMAL:
4520 case POSIX_FADV_RANDOM:
4521 case POSIX_FADV_SEQUENTIAL:
4522 break;
4523 default:
4524 return -EAGAIN;
4525 }
4526 }
4840e418
JA
4527
4528 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4529 if (ret < 0)
93d2bcd2 4530 req_set_fail(req);
0bdf3398 4531 __io_req_complete(req, issue_flags, ret, 0);
4840e418
JA
4532 return 0;
4533}
4534
eddc7ef5
JA
4535static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4536{
2d74d042 4537 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4538 return -EINVAL;
26578cda 4539 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
eddc7ef5 4540 return -EINVAL;
9c280f90 4541 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4542 return -EBADF;
eddc7ef5 4543
1d9e1288
BM
4544 req->statx.dfd = READ_ONCE(sqe->fd);
4545 req->statx.mask = READ_ONCE(sqe->len);
e62753e4 4546 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
4547 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4548 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5
JA
4549
4550 return 0;
4551}
4552
45d189c6 4553static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 4554{
1d9e1288 4555 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
4556 int ret;
4557
59d70013 4558 if (issue_flags & IO_URING_F_NONBLOCK)
eddc7ef5
JA
4559 return -EAGAIN;
4560
e62753e4
BM
4561 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4562 ctx->buffer);
eddc7ef5 4563
eddc7ef5 4564 if (ret < 0)
93d2bcd2 4565 req_set_fail(req);
e1e16097 4566 io_req_complete(req, ret);
eddc7ef5
JA
4567 return 0;
4568}
4569
b5dba59e
JA
4570static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4571{
14587a46 4572 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4573 return -EINVAL;
b5dba59e 4574 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
7df778be 4575 sqe->rw_flags || sqe->buf_index)
b5dba59e 4576 return -EINVAL;
9c280f90 4577 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 4578 return -EBADF;
b5dba59e
JA
4579
4580 req->close.fd = READ_ONCE(sqe->fd);
7df778be
PB
4581 req->close.file_slot = READ_ONCE(sqe->file_index);
4582 if (req->close.file_slot && req->close.fd)
4583 return -EINVAL;
4584
b5dba59e 4585 return 0;
b5dba59e
JA
4586}
4587
889fca73 4588static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 4589{
9eac1904 4590 struct files_struct *files = current->files;
3af73b28 4591 struct io_close *close = &req->close;
9eac1904 4592 struct fdtable *fdt;
a1fde923
PB
4593 struct file *file = NULL;
4594 int ret = -EBADF;
b5dba59e 4595
7df778be
PB
4596 if (req->close.file_slot) {
4597 ret = io_close_fixed(req, issue_flags);
4598 goto err;
4599 }
4600
9eac1904
JA
4601 spin_lock(&files->file_lock);
4602 fdt = files_fdtable(files);
4603 if (close->fd >= fdt->max_fds) {
4604 spin_unlock(&files->file_lock);
4605 goto err;
4606 }
4607 file = fdt->fd[close->fd];
a1fde923 4608 if (!file || file->f_op == &io_uring_fops) {
9eac1904
JA
4609 spin_unlock(&files->file_lock);
4610 file = NULL;
4611 goto err;
3af73b28 4612 }
b5dba59e
JA
4613
4614 /* if the file has a flush method, be safe and punt to async */
45d189c6 4615 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 4616 spin_unlock(&files->file_lock);
0bf0eefd 4617 return -EAGAIN;
a2100672 4618 }
b5dba59e 4619
9eac1904
JA
4620 ret = __close_fd_get_file(close->fd, &file);
4621 spin_unlock(&files->file_lock);
4622 if (ret < 0) {
4623 if (ret == -ENOENT)
4624 ret = -EBADF;
4625 goto err;
4626 }
4627
3af73b28 4628 /* No ->flush() or already async, safely close from here */
9eac1904
JA
4629 ret = filp_close(file, current->files);
4630err:
3af73b28 4631 if (ret < 0)
93d2bcd2 4632 req_set_fail(req);
9eac1904
JA
4633 if (file)
4634 fput(file);
889fca73 4635 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 4636 return 0;
b5dba59e
JA
4637}
4638
1155c76a 4639static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
4640{
4641 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 4642
5d17b4a4
JA
4643 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4644 return -EINVAL;
26578cda
PB
4645 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4646 sqe->splice_fd_in))
5d17b4a4
JA
4647 return -EINVAL;
4648
8ed8d3c3
JA
4649 req->sync.off = READ_ONCE(sqe->off);
4650 req->sync.len = READ_ONCE(sqe->len);
4651 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
4652 return 0;
4653}
4654
45d189c6 4655static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4656{
8ed8d3c3
JA
4657 int ret;
4658
ac45abc0 4659 /* sync_file_range always requires a blocking context */
45d189c6 4660 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4661 return -EAGAIN;
4662
9adbd45d 4663 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
4664 req->sync.flags);
4665 if (ret < 0)
93d2bcd2 4666 req_set_fail(req);
e1e16097 4667 io_req_complete(req, ret);
5d17b4a4
JA
4668 return 0;
4669}
4670
469956e8 4671#if defined(CONFIG_NET)
02d27d89
PB
4672static int io_setup_async_msg(struct io_kiocb *req,
4673 struct io_async_msghdr *kmsg)
4674{
e8c2bc1f
JA
4675 struct io_async_msghdr *async_msg = req->async_data;
4676
4677 if (async_msg)
02d27d89 4678 return -EAGAIN;
e8c2bc1f 4679 if (io_alloc_async_data(req)) {
257e84a5 4680 kfree(kmsg->free_iov);
02d27d89
PB
4681 return -ENOMEM;
4682 }
e8c2bc1f 4683 async_msg = req->async_data;
02d27d89 4684 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 4685 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 4686 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
4687 /* if were using fast_iov, set it to the new one */
4688 if (!async_msg->free_iov)
4689 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4690
02d27d89
PB
4691 return -EAGAIN;
4692}
4693
2ae523ed
PB
4694static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4695 struct io_async_msghdr *iomsg)
4696{
2ae523ed 4697 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 4698 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 4699 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 4700 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
4701}
4702
93642ef8
PB
4703static int io_sendmsg_prep_async(struct io_kiocb *req)
4704{
4705 int ret;
4706
93642ef8
PB
4707 ret = io_sendmsg_copy_hdr(req, req->async_data);
4708 if (!ret)
4709 req->flags |= REQ_F_NEED_CLEANUP;
4710 return ret;
4711}
4712
3529d8c2 4713static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 4714{
e47293fd 4715 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 4716
d2b6f48b
PB
4717 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4718 return -EINVAL;
4719
270a5940 4720 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 4721 sr->len = READ_ONCE(sqe->len);
04411806
PB
4722 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4723 if (sr->msg_flags & MSG_DONTWAIT)
4724 req->flags |= REQ_F_NOWAIT;
3529d8c2 4725
d8768362
JA
4726#ifdef CONFIG_COMPAT
4727 if (req->ctx->compat)
4728 sr->msg_flags |= MSG_CMSG_COMPAT;
4729#endif
93642ef8 4730 return 0;
03b1230c
JA
4731}
4732
889fca73 4733static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4734{
6b754c8b 4735 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 4736 struct socket *sock;
7a7cacba 4737 unsigned flags;
0031275d 4738 int min_ret = 0;
0fa03c62
JA
4739 int ret;
4740
dba4a925 4741 sock = sock_from_file(req->file);
7a7cacba 4742 if (unlikely(!sock))
dba4a925 4743 return -ENOTSOCK;
3529d8c2 4744
d886e185
PB
4745 if (req_has_async_data(req)) {
4746 kmsg = req->async_data;
4747 } else {
7a7cacba
PB
4748 ret = io_sendmsg_copy_hdr(req, &iomsg);
4749 if (ret)
4750 return ret;
4751 kmsg = &iomsg;
0fa03c62 4752 }
0fa03c62 4753
04411806
PB
4754 flags = req->sr_msg.msg_flags;
4755 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4756 flags |= MSG_DONTWAIT;
0031275d
SM
4757 if (flags & MSG_WAITALL)
4758 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4759
7a7cacba 4760 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
45d189c6 4761 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4762 return io_setup_async_msg(req, kmsg);
4763 if (ret == -ERESTARTSYS)
4764 ret = -EINTR;
0fa03c62 4765
257e84a5
PB
4766 /* fast path, check for non-NULL to avoid function call */
4767 if (kmsg->free_iov)
4768 kfree(kmsg->free_iov);
99bc4c38 4769 req->flags &= ~REQ_F_NEED_CLEANUP;
0031275d 4770 if (ret < min_ret)
93d2bcd2 4771 req_set_fail(req);
889fca73 4772 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 4773 return 0;
03b1230c 4774}
aa1fa28f 4775
889fca73 4776static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 4777{
7a7cacba
PB
4778 struct io_sr_msg *sr = &req->sr_msg;
4779 struct msghdr msg;
4780 struct iovec iov;
fddaface 4781 struct socket *sock;
7a7cacba 4782 unsigned flags;
0031275d 4783 int min_ret = 0;
fddaface
JA
4784 int ret;
4785
dba4a925 4786 sock = sock_from_file(req->file);
7a7cacba 4787 if (unlikely(!sock))
dba4a925 4788 return -ENOTSOCK;
fddaface 4789
7a7cacba
PB
4790 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4791 if (unlikely(ret))
14db8411 4792 return ret;
fddaface 4793
7a7cacba
PB
4794 msg.msg_name = NULL;
4795 msg.msg_control = NULL;
4796 msg.msg_controllen = 0;
4797 msg.msg_namelen = 0;
fddaface 4798
04411806
PB
4799 flags = req->sr_msg.msg_flags;
4800 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 4801 flags |= MSG_DONTWAIT;
0031275d
SM
4802 if (flags & MSG_WAITALL)
4803 min_ret = iov_iter_count(&msg.msg_iter);
4804
7a7cacba
PB
4805 msg.msg_flags = flags;
4806 ret = sock_sendmsg(sock, &msg);
45d189c6 4807 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
7a7cacba
PB
4808 return -EAGAIN;
4809 if (ret == -ERESTARTSYS)
4810 ret = -EINTR;
fddaface 4811
0031275d 4812 if (ret < min_ret)
93d2bcd2 4813 req_set_fail(req);
889fca73 4814 __io_req_complete(req, issue_flags, ret, 0);
fddaface 4815 return 0;
fddaface
JA
4816}
4817
1400e697
PB
4818static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4819 struct io_async_msghdr *iomsg)
52de1fe1
JA
4820{
4821 struct io_sr_msg *sr = &req->sr_msg;
4822 struct iovec __user *uiov;
4823 size_t iov_len;
4824 int ret;
4825
1400e697
PB
4826 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4827 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
4828 if (ret)
4829 return ret;
4830
4831 if (req->flags & REQ_F_BUFFER_SELECT) {
4832 if (iov_len > 1)
4833 return -EINVAL;
5476dfed 4834 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 4835 return -EFAULT;
5476dfed 4836 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 4837 iomsg->free_iov = NULL;
52de1fe1 4838 } else {
257e84a5 4839 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4840 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 4841 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 4842 false);
52de1fe1
JA
4843 if (ret > 0)
4844 ret = 0;
4845 }
4846
4847 return ret;
4848}
4849
4850#ifdef CONFIG_COMPAT
4851static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 4852 struct io_async_msghdr *iomsg)
52de1fe1 4853{
52de1fe1
JA
4854 struct io_sr_msg *sr = &req->sr_msg;
4855 struct compat_iovec __user *uiov;
4856 compat_uptr_t ptr;
4857 compat_size_t len;
4858 int ret;
4859
4af3417a
PB
4860 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4861 &ptr, &len);
52de1fe1
JA
4862 if (ret)
4863 return ret;
4864
4865 uiov = compat_ptr(ptr);
4866 if (req->flags & REQ_F_BUFFER_SELECT) {
4867 compat_ssize_t clen;
4868
4869 if (len > 1)
4870 return -EINVAL;
4871 if (!access_ok(uiov, sizeof(*uiov)))
4872 return -EFAULT;
4873 if (__get_user(clen, &uiov->iov_len))
4874 return -EFAULT;
4875 if (clen < 0)
4876 return -EINVAL;
2d280bc8 4877 sr->len = clen;
257e84a5 4878 iomsg->free_iov = NULL;
52de1fe1 4879 } else {
257e84a5 4880 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 4881 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 4882 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 4883 &iomsg->msg.msg_iter, true);
52de1fe1
JA
4884 if (ret < 0)
4885 return ret;
4886 }
4887
4888 return 0;
4889}
4890#endif
4891
1400e697
PB
4892static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4893 struct io_async_msghdr *iomsg)
52de1fe1 4894{
1400e697 4895 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
4896
4897#ifdef CONFIG_COMPAT
4898 if (req->ctx->compat)
1400e697 4899 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 4900#endif
52de1fe1 4901
1400e697 4902 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
4903}
4904
bcda7baa 4905static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
7fbb1b54 4906 bool needs_lock)
bcda7baa
JA
4907{
4908 struct io_sr_msg *sr = &req->sr_msg;
bcda7baa 4909
30d51dd4 4910 return io_buffer_select(req, &sr->len, sr->bgid, needs_lock);
fddaface
JA
4911}
4912
7fbb1b54
PB
4913static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4914{
30d51dd4 4915 return io_put_kbuf(req, req->kbuf);
7fbb1b54
PB
4916}
4917
93642ef8 4918static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 4919{
99bc4c38 4920 int ret;
3529d8c2 4921
93642ef8
PB
4922 ret = io_recvmsg_copy_hdr(req, req->async_data);
4923 if (!ret)
4924 req->flags |= REQ_F_NEED_CLEANUP;
4925 return ret;
4926}
4927
4928static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4929{
4930 struct io_sr_msg *sr = &req->sr_msg;
4931
d2b6f48b
PB
4932 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4933 return -EINVAL;
4934
270a5940 4935 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 4936 sr->len = READ_ONCE(sqe->len);
bcda7baa 4937 sr->bgid = READ_ONCE(sqe->buf_group);
04411806
PB
4938 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4939 if (sr->msg_flags & MSG_DONTWAIT)
4940 req->flags |= REQ_F_NOWAIT;
06b76d44 4941
d8768362
JA
4942#ifdef CONFIG_COMPAT
4943 if (req->ctx->compat)
4944 sr->msg_flags |= MSG_CMSG_COMPAT;
4945#endif
93642ef8 4946 return 0;
aa1fa28f
JA
4947}
4948
889fca73 4949static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 4950{
6b754c8b 4951 struct io_async_msghdr iomsg, *kmsg;
03b1230c 4952 struct socket *sock;
7fbb1b54 4953 struct io_buffer *kbuf;
7a7cacba 4954 unsigned flags;
0031275d 4955 int min_ret = 0;
52de1fe1 4956 int ret, cflags = 0;
45d189c6 4957 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 4958
dba4a925 4959 sock = sock_from_file(req->file);
7a7cacba 4960 if (unlikely(!sock))
dba4a925 4961 return -ENOTSOCK;
3529d8c2 4962
d886e185
PB
4963 if (req_has_async_data(req)) {
4964 kmsg = req->async_data;
4965 } else {
7a7cacba
PB
4966 ret = io_recvmsg_copy_hdr(req, &iomsg);
4967 if (ret)
681fda8d 4968 return ret;
7a7cacba
PB
4969 kmsg = &iomsg;
4970 }
03b1230c 4971
bc02ef33 4972 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 4973 kbuf = io_recv_buffer_select(req, !force_nonblock);
bc02ef33 4974 if (IS_ERR(kbuf))
52de1fe1 4975 return PTR_ERR(kbuf);
7a7cacba 4976 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
4977 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4978 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
4979 1, req->sr_msg.len);
4980 }
52de1fe1 4981
04411806
PB
4982 flags = req->sr_msg.msg_flags;
4983 if (force_nonblock)
7a7cacba 4984 flags |= MSG_DONTWAIT;
0031275d
SM
4985 if (flags & MSG_WAITALL)
4986 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4987
7a7cacba
PB
4988 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4989 kmsg->uaddr, flags);
0e1b6fe3
PB
4990 if (force_nonblock && ret == -EAGAIN)
4991 return io_setup_async_msg(req, kmsg);
7a7cacba
PB
4992 if (ret == -ERESTARTSYS)
4993 ret = -EINTR;
03b1230c 4994
7fbb1b54
PB
4995 if (req->flags & REQ_F_BUFFER_SELECTED)
4996 cflags = io_put_recv_kbuf(req);
257e84a5
PB
4997 /* fast path, check for non-NULL to avoid function call */
4998 if (kmsg->free_iov)
4999 kfree(kmsg->free_iov);
99bc4c38 5000 req->flags &= ~REQ_F_NEED_CLEANUP;
0031275d 5001 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
93d2bcd2 5002 req_set_fail(req);
889fca73 5003 __io_req_complete(req, issue_flags, ret, cflags);
03b1230c 5004 return 0;
0fa03c62 5005}
5d17b4a4 5006
889fca73 5007static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 5008{
6b754c8b 5009 struct io_buffer *kbuf;
7a7cacba
PB
5010 struct io_sr_msg *sr = &req->sr_msg;
5011 struct msghdr msg;
5012 void __user *buf = sr->buf;
fddaface 5013 struct socket *sock;
7a7cacba
PB
5014 struct iovec iov;
5015 unsigned flags;
0031275d 5016 int min_ret = 0;
bcda7baa 5017 int ret, cflags = 0;
45d189c6 5018 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 5019
dba4a925 5020 sock = sock_from_file(req->file);
7a7cacba 5021 if (unlikely(!sock))
dba4a925 5022 return -ENOTSOCK;
fddaface 5023
bc02ef33 5024 if (req->flags & REQ_F_BUFFER_SELECT) {
7fbb1b54 5025 kbuf = io_recv_buffer_select(req, !force_nonblock);
bcda7baa
JA
5026 if (IS_ERR(kbuf))
5027 return PTR_ERR(kbuf);
7a7cacba 5028 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 5029 }
bcda7baa 5030
7a7cacba 5031 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
5032 if (unlikely(ret))
5033 goto out_free;
fddaface 5034
7a7cacba
PB
5035 msg.msg_name = NULL;
5036 msg.msg_control = NULL;
5037 msg.msg_controllen = 0;
5038 msg.msg_namelen = 0;
5039 msg.msg_iocb = NULL;
5040 msg.msg_flags = 0;
fddaface 5041
04411806
PB
5042 flags = req->sr_msg.msg_flags;
5043 if (force_nonblock)
7a7cacba 5044 flags |= MSG_DONTWAIT;
0031275d
SM
5045 if (flags & MSG_WAITALL)
5046 min_ret = iov_iter_count(&msg.msg_iter);
5047
7a7cacba
PB
5048 ret = sock_recvmsg(sock, &msg, flags);
5049 if (force_nonblock && ret == -EAGAIN)
5050 return -EAGAIN;
5051 if (ret == -ERESTARTSYS)
5052 ret = -EINTR;
14c32eee 5053out_free:
7fbb1b54
PB
5054 if (req->flags & REQ_F_BUFFER_SELECTED)
5055 cflags = io_put_recv_kbuf(req);
0031275d 5056 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
93d2bcd2 5057 req_set_fail(req);
889fca73 5058 __io_req_complete(req, issue_flags, ret, cflags);
fddaface 5059 return 0;
fddaface
JA
5060}
5061
3529d8c2 5062static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 5063{
8ed8d3c3
JA
5064 struct io_accept *accept = &req->accept;
5065
14587a46 5066 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 5067 return -EINVAL;
aaa4db12 5068 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
5069 return -EINVAL;
5070
d55e5f5b
JA
5071 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5072 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 5073 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 5074 accept->nofile = rlimit(RLIMIT_NOFILE);
a7083ad5 5075
aaa4db12
PB
5076 accept->file_slot = READ_ONCE(sqe->file_index);
5077 if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
5078 (accept->flags & SOCK_CLOEXEC)))
5079 return -EINVAL;
a7083ad5
PB
5080 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5081 return -EINVAL;
5082 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5083 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
8ed8d3c3 5084 return 0;
8ed8d3c3 5085}
17f2fe35 5086
889fca73 5087static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
5088{
5089 struct io_accept *accept = &req->accept;
45d189c6 5090 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 5091 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
aaa4db12 5092 bool fixed = !!accept->file_slot;
a7083ad5
PB
5093 struct file *file;
5094 int ret, fd;
8ed8d3c3 5095
e697deed
JX
5096 if (req->file->f_flags & O_NONBLOCK)
5097 req->flags |= REQ_F_NOWAIT;
5098
aaa4db12
PB
5099 if (!fixed) {
5100 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5101 if (unlikely(fd < 0))
5102 return fd;
5103 }
a7083ad5
PB
5104 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5105 accept->flags);
5106 if (IS_ERR(file)) {
aaa4db12
PB
5107 if (!fixed)
5108 put_unused_fd(fd);
a7083ad5
PB
5109 ret = PTR_ERR(file);
5110 if (ret == -EAGAIN && force_nonblock)
5111 return -EAGAIN;
ac45abc0
PB
5112 if (ret == -ERESTARTSYS)
5113 ret = -EINTR;
93d2bcd2 5114 req_set_fail(req);
aaa4db12 5115 } else if (!fixed) {
a7083ad5
PB
5116 fd_install(fd, file);
5117 ret = fd;
aaa4db12
PB
5118 } else {
5119 ret = io_install_fixed_file(req, file, issue_flags,
5120 accept->file_slot - 1);
ac45abc0 5121 }
889fca73 5122 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 5123 return 0;
8ed8d3c3
JA
5124}
5125
93642ef8
PB
5126static int io_connect_prep_async(struct io_kiocb *req)
5127{
5128 struct io_async_connect *io = req->async_data;
5129 struct io_connect *conn = &req->connect;
5130
5131 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5132}
5133
3529d8c2 5134static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 5135{
3529d8c2 5136 struct io_connect *conn = &req->connect;
f499a021 5137
14587a46 5138 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1 5139 return -EINVAL;
26578cda
PB
5140 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5141 sqe->splice_fd_in)
3fbb51c1
JA
5142 return -EINVAL;
5143
3529d8c2
JA
5144 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5145 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 5146 return 0;
f499a021
JA
5147}
5148
889fca73 5149static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 5150{
e8c2bc1f 5151 struct io_async_connect __io, *io;
f8e85cf2 5152 unsigned file_flags;
3fbb51c1 5153 int ret;
45d189c6 5154 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 5155
d886e185 5156 if (req_has_async_data(req)) {
e8c2bc1f 5157 io = req->async_data;
f499a021 5158 } else {
3529d8c2
JA
5159 ret = move_addr_to_kernel(req->connect.addr,
5160 req->connect.addr_len,
e8c2bc1f 5161 &__io.address);
f499a021
JA
5162 if (ret)
5163 goto out;
5164 io = &__io;
5165 }
5166
3fbb51c1
JA
5167 file_flags = force_nonblock ? O_NONBLOCK : 0;
5168
e8c2bc1f 5169 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 5170 req->connect.addr_len, file_flags);
87f80d62 5171 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
d886e185 5172 if (req_has_async_data(req))
b7bb4f7d 5173 return -EAGAIN;
e8c2bc1f 5174 if (io_alloc_async_data(req)) {
f499a021
JA
5175 ret = -ENOMEM;
5176 goto out;
5177 }
e8c2bc1f 5178 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 5179 return -EAGAIN;
f499a021 5180 }
f8e85cf2
JA
5181 if (ret == -ERESTARTSYS)
5182 ret = -EINTR;
f499a021 5183out:
4e88d6e7 5184 if (ret < 0)
93d2bcd2 5185 req_set_fail(req);
889fca73 5186 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 5187 return 0;
469956e8
Y
5188}
5189#else /* !CONFIG_NET */
99a10081
JA
5190#define IO_NETOP_FN(op) \
5191static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
5192{ \
5193 return -EOPNOTSUPP; \
5194}
5195
5196#define IO_NETOP_PREP(op) \
5197IO_NETOP_FN(op) \
5198static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5199{ \
5200 return -EOPNOTSUPP; \
5201} \
5202
5203#define IO_NETOP_PREP_ASYNC(op) \
5204IO_NETOP_PREP(op) \
5205static int io_##op##_prep_async(struct io_kiocb *req) \
5206{ \
5207 return -EOPNOTSUPP; \
5208}
5209
5210IO_NETOP_PREP_ASYNC(sendmsg);
5211IO_NETOP_PREP_ASYNC(recvmsg);
5212IO_NETOP_PREP_ASYNC(connect);
5213IO_NETOP_PREP(accept);
5214IO_NETOP_FN(send);
5215IO_NETOP_FN(recv);
469956e8 5216#endif /* CONFIG_NET */
f8e85cf2 5217
d7718a9d
JA
5218struct io_poll_table {
5219 struct poll_table_struct pt;
5220 struct io_kiocb *req;
68b11e8b 5221 int nr_entries;
d7718a9d
JA
5222 int error;
5223};
ce593a6c 5224
d7718a9d 5225static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
5b0a6acc 5226 __poll_t mask, io_req_tw_func_t func)
d7718a9d 5227{
d7718a9d
JA
5228 /* for instances that support it check for an event match first: */
5229 if (mask && !(mask & poll->events))
5230 return 0;
5231
5232 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
5233
5234 list_del_init(&poll->wait.entry);
5235
d7718a9d 5236 req->result = mask;
5b0a6acc 5237 req->io_task_work.func = func;
6d816e08 5238
d7718a9d 5239 /*
e3aabf95
JA
5240 * If this fails, then the task is exiting. When a task exits, the
5241 * work gets canceled, so just cancel this request as well instead
5242 * of executing it. We can't safely execute it anyway, as we may not
5243 * have the needed state needed for it anyway.
d7718a9d 5244 */
e09ee510 5245 io_req_task_work_add(req);
d7718a9d
JA
5246 return 1;
5247}
5248
74ce6ce4
JA
5249static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
5250 __acquires(&req->ctx->completion_lock)
5251{
5252 struct io_ring_ctx *ctx = req->ctx;
5253
316319e8 5254 /* req->task == current here, checking PF_EXITING is safe */
e09ee510
PB
5255 if (unlikely(req->task->flags & PF_EXITING))
5256 WRITE_ONCE(poll->canceled, true);
5257
74ce6ce4
JA
5258 if (!req->result && !READ_ONCE(poll->canceled)) {
5259 struct poll_table_struct pt = { ._key = poll->events };
5260
5261 req->result = vfs_poll(req->file, &pt) & poll->events;
5262 }
5263
79ebeaee 5264 spin_lock(&ctx->completion_lock);
74ce6ce4
JA
5265 if (!req->result && !READ_ONCE(poll->canceled)) {
5266 add_wait_queue(poll->head, &poll->wait);
5267 return true;
5268 }
5269
5270 return false;
5271}
5272
d4e7cd36 5273static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 5274{
e8c2bc1f 5275 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 5276 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 5277 return req->async_data;
d4e7cd36
JA
5278 return req->apoll->double_poll;
5279}
5280
5281static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5282{
5283 if (req->opcode == IORING_OP_POLL_ADD)
5284 return &req->poll;
5285 return &req->apoll->poll;
5286}
5287
5288static void io_poll_remove_double(struct io_kiocb *req)
e07785b0 5289 __must_hold(&req->ctx->completion_lock)
d4e7cd36
JA
5290{
5291 struct io_poll_iocb *poll = io_poll_get_double(req);
18bceab1
JA
5292
5293 lockdep_assert_held(&req->ctx->completion_lock);
5294
5295 if (poll && poll->head) {
5296 struct wait_queue_head *head = poll->head;
5297
79ebeaee 5298 spin_lock_irq(&head->lock);
18bceab1
JA
5299 list_del_init(&poll->wait.entry);
5300 if (poll->wait.private)
de9b4cca 5301 req_ref_put(req);
18bceab1 5302 poll->head = NULL;
79ebeaee 5303 spin_unlock_irq(&head->lock);
18bceab1
JA
5304 }
5305}
5306
31efe48e 5307static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
e07785b0 5308 __must_hold(&req->ctx->completion_lock)
18bceab1
JA
5309{
5310 struct io_ring_ctx *ctx = req->ctx;
88e41cf9 5311 unsigned flags = IORING_CQE_F_MORE;
e27414be 5312 int error;
18bceab1 5313
e27414be 5314 if (READ_ONCE(req->poll.canceled)) {
45ab03b1 5315 error = -ECANCELED;
88e41cf9 5316 req->poll.events |= EPOLLONESHOT;
e27414be 5317 } else {
5082620f 5318 error = mangle_poll(mask);
e27414be 5319 }
b69de288
JA
5320 if (req->poll.events & EPOLLONESHOT)
5321 flags = 0;
a62682f9
HX
5322 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
5323 req->poll.events |= EPOLLONESHOT;
88e41cf9 5324 flags = 0;
a62682f9 5325 }
7b289c38
HX
5326 if (flags & IORING_CQE_F_MORE)
5327 ctx->cq_extra++;
18bceab1 5328
88e41cf9 5329 return !(flags & IORING_CQE_F_MORE);
18bceab1
JA
5330}
5331
f237c30a 5332static void io_poll_task_func(struct io_kiocb *req, bool *locked)
18bceab1
JA
5333{
5334 struct io_ring_ctx *ctx = req->ctx;
dd221f46 5335 struct io_kiocb *nxt;
18bceab1
JA
5336
5337 if (io_poll_rewait(req, &req->poll)) {
79ebeaee 5338 spin_unlock(&ctx->completion_lock);
dd221f46 5339 } else {
f40b964a 5340 bool done;
18bceab1 5341
5b7aa38d
HX
5342 if (req->poll.done) {
5343 spin_unlock(&ctx->completion_lock);
5344 return;
5345 }
31efe48e 5346 done = __io_poll_complete(req, req->result);
88e41cf9 5347 if (done) {
a890d01e 5348 io_poll_remove_double(req);
88e41cf9 5349 hash_del(&req->hash_node);
bd99c71b 5350 req->poll.done = true;
f40b964a 5351 } else {
88e41cf9
JA
5352 req->result = 0;
5353 add_wait_queue(req->poll.head, &req->poll.wait);
5354 }
31efe48e 5355 io_commit_cqring(ctx);
79ebeaee 5356 spin_unlock(&ctx->completion_lock);
dd221f46 5357 io_cqring_ev_posted(ctx);
18bceab1 5358
88e41cf9
JA
5359 if (done) {
5360 nxt = io_put_req_find_next(req);
5361 if (nxt)
f237c30a 5362 io_req_task_submit(nxt, locked);
88e41cf9 5363 }
dd221f46 5364 }
18bceab1
JA
5365}
5366
5367static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
5368 int sync, void *key)
5369{
5370 struct io_kiocb *req = wait->private;
d4e7cd36 5371 struct io_poll_iocb *poll = io_poll_get_single(req);
18bceab1 5372 __poll_t mask = key_to_poll(key);
79ebeaee 5373 unsigned long flags;
18bceab1
JA
5374
5375 /* for instances that support it check for an event match first: */
5376 if (mask && !(mask & poll->events))
5377 return 0;
88e41cf9
JA
5378 if (!(poll->events & EPOLLONESHOT))
5379 return poll->wait.func(&poll->wait, mode, sync, key);
18bceab1 5380
8706e04e
JA
5381 list_del_init(&wait->entry);
5382
9ce85ef2 5383 if (poll->head) {
18bceab1
JA
5384 bool done;
5385
79ebeaee 5386 spin_lock_irqsave(&poll->head->lock, flags);
807abcb0 5387 done = list_empty(&poll->wait.entry);
18bceab1 5388 if (!done)
807abcb0 5389 list_del_init(&poll->wait.entry);
d4e7cd36
JA
5390 /* make sure double remove sees this as being gone */
5391 wait->private = NULL;
79ebeaee 5392 spin_unlock_irqrestore(&poll->head->lock, flags);
c8b5e260
JA
5393 if (!done) {
5394 /* use wait func handler, so it matches the rq type */
5395 poll->wait.func(&poll->wait, mode, sync, key);
5396 }
18bceab1 5397 }
de9b4cca 5398 req_ref_put(req);
18bceab1
JA
5399 return 1;
5400}
5401
5402static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5403 wait_queue_func_t wake_func)
5404{
5405 poll->head = NULL;
5406 poll->done = false;
5407 poll->canceled = false;
464dca61
JA
5408#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5409 /* mask in events that we always want/need */
5410 poll->events = events | IO_POLL_UNMASK;
18bceab1
JA
5411 INIT_LIST_HEAD(&poll->wait.entry);
5412 init_waitqueue_func_entry(&poll->wait, wake_func);
5413}
5414
5415static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
5416 struct wait_queue_head *head,
5417 struct io_poll_iocb **poll_ptr)
18bceab1
JA
5418{
5419 struct io_kiocb *req = pt->req;
5420
5421 /*
68b11e8b
PB
5422 * The file being polled uses multiple waitqueues for poll handling
5423 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
5424 * if this happens.
18bceab1 5425 */
68b11e8b 5426 if (unlikely(pt->nr_entries)) {
58852d4d
PB
5427 struct io_poll_iocb *poll_one = poll;
5428
23a65db8
PB
5429 /* double add on the same waitqueue head, ignore */
5430 if (poll_one->head == head)
5431 return;
18bceab1 5432 /* already have a 2nd entry, fail a third attempt */
807abcb0 5433 if (*poll_ptr) {
23a65db8
PB
5434 if ((*poll_ptr)->head == head)
5435 return;
18bceab1
JA
5436 pt->error = -EINVAL;
5437 return;
5438 }
ea6a693d
JA
5439 /*
5440 * Can't handle multishot for double wait for now, turn it
5441 * into one-shot mode.
5442 */
7a274727
PB
5443 if (!(poll_one->events & EPOLLONESHOT))
5444 poll_one->events |= EPOLLONESHOT;
18bceab1
JA
5445 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5446 if (!poll) {
5447 pt->error = -ENOMEM;
5448 return;
5449 }
58852d4d 5450 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
de9b4cca 5451 req_ref_get(req);
18bceab1 5452 poll->wait.private = req;
d886e185 5453
807abcb0 5454 *poll_ptr = poll;
d886e185
PB
5455 if (req->opcode == IORING_OP_POLL_ADD)
5456 req->flags |= REQ_F_ASYNC_DATA;
18bceab1
JA
5457 }
5458
68b11e8b 5459 pt->nr_entries++;
18bceab1 5460 poll->head = head;
a31eb4a2
JX
5461
5462 if (poll->events & EPOLLEXCLUSIVE)
5463 add_wait_queue_exclusive(head, &poll->wait);
5464 else
5465 add_wait_queue(head, &poll->wait);
18bceab1
JA
5466}
5467
5468static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5469 struct poll_table_struct *p)
5470{
5471 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
807abcb0 5472 struct async_poll *apoll = pt->req->apoll;
18bceab1 5473
807abcb0 5474 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
18bceab1
JA
5475}
5476
f237c30a 5477static void io_async_task_func(struct io_kiocb *req, bool *locked)
d7718a9d 5478{
d7718a9d
JA
5479 struct async_poll *apoll = req->apoll;
5480 struct io_ring_ctx *ctx = req->ctx;
5481
236daeae 5482 trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
d7718a9d 5483
74ce6ce4 5484 if (io_poll_rewait(req, &apoll->poll)) {
79ebeaee 5485 spin_unlock(&ctx->completion_lock);
74ce6ce4 5486 return;
d7718a9d
JA
5487 }
5488
0ea13b44 5489 hash_del(&req->hash_node);
d4e7cd36 5490 io_poll_remove_double(req);
bd99c71b 5491 apoll->poll.done = true;
79ebeaee 5492 spin_unlock(&ctx->completion_lock);
74ce6ce4 5493
0be0b0e3 5494 if (!READ_ONCE(apoll->poll.canceled))
f237c30a 5495 io_req_task_submit(req, locked);
0be0b0e3 5496 else
2593553a 5497 io_req_complete_failed(req, -ECANCELED);
d7718a9d
JA
5498}
5499
5500static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5501 void *key)
5502{
5503 struct io_kiocb *req = wait->private;
5504 struct io_poll_iocb *poll = &req->apoll->poll;
5505
5506 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5507 key_to_poll(key));
5508
5509 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5510}
5511
5512static void io_poll_req_insert(struct io_kiocb *req)
5513{
5514 struct io_ring_ctx *ctx = req->ctx;
5515 struct hlist_head *list;
5516
5517 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5518 hlist_add_head(&req->hash_node, list);
5519}
5520
5521static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5522 struct io_poll_iocb *poll,
5523 struct io_poll_table *ipt, __poll_t mask,
5524 wait_queue_func_t wake_func)
5525 __acquires(&ctx->completion_lock)
5526{
5527 struct io_ring_ctx *ctx = req->ctx;
5528 bool cancel = false;
5529
4d52f338 5530 INIT_HLIST_NODE(&req->hash_node);
18bceab1 5531 io_init_poll_iocb(poll, mask, wake_func);
b90cd197 5532 poll->file = req->file;
18bceab1 5533 poll->wait.private = req;
d7718a9d
JA
5534
5535 ipt->pt._key = mask;
5536 ipt->req = req;
68b11e8b
PB
5537 ipt->error = 0;
5538 ipt->nr_entries = 0;
d7718a9d 5539
d7718a9d 5540 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
68b11e8b
PB
5541 if (unlikely(!ipt->nr_entries) && !ipt->error)
5542 ipt->error = -EINVAL;
d7718a9d 5543
79ebeaee 5544 spin_lock(&ctx->completion_lock);
a890d01e 5545 if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
46fee9ab 5546 io_poll_remove_double(req);
d7718a9d 5547 if (likely(poll->head)) {
79ebeaee 5548 spin_lock_irq(&poll->head->lock);
d7718a9d
JA
5549 if (unlikely(list_empty(&poll->wait.entry))) {
5550 if (ipt->error)
5551 cancel = true;
5552 ipt->error = 0;
5553 mask = 0;
5554 }
88e41cf9 5555 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
d7718a9d
JA
5556 list_del_init(&poll->wait.entry);
5557 else if (cancel)
5558 WRITE_ONCE(poll->canceled, true);
5559 else if (!poll->done) /* actually waiting for an event */
5560 io_poll_req_insert(req);
79ebeaee 5561 spin_unlock_irq(&poll->head->lock);
d7718a9d
JA
5562 }
5563
5564 return mask;
5565}
5566
59b735ae
OL
5567enum {
5568 IO_APOLL_OK,
5569 IO_APOLL_ABORTED,
5570 IO_APOLL_READY
5571};
5572
5573static int io_arm_poll_handler(struct io_kiocb *req)
d7718a9d
JA
5574{
5575 const struct io_op_def *def = &io_op_defs[req->opcode];
5576 struct io_ring_ctx *ctx = req->ctx;
5577 struct async_poll *apoll;
5578 struct io_poll_table ipt;
b2d9c3da 5579 __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
9dab14b8 5580 int rw;
d7718a9d
JA
5581
5582 if (!req->file || !file_can_poll(req->file))
59b735ae 5583 return IO_APOLL_ABORTED;
24c74678 5584 if (req->flags & REQ_F_POLLED)
59b735ae 5585 return IO_APOLL_ABORTED;
b2d9c3da
PB
5586 if (!def->pollin && !def->pollout)
5587 return IO_APOLL_ABORTED;
5588
5589 if (def->pollin) {
9dab14b8 5590 rw = READ;
b2d9c3da
PB
5591 mask |= POLLIN | POLLRDNORM;
5592
5593 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5594 if ((req->opcode == IORING_OP_RECVMSG) &&
5595 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5596 mask &= ~POLLIN;
5597 } else {
9dab14b8 5598 rw = WRITE;
b2d9c3da
PB
5599 mask |= POLLOUT | POLLWRNORM;
5600 }
5601
9dab14b8 5602 /* if we can't nonblock try, then no point in arming a poll handler */
b191e2df 5603 if (!io_file_supports_nowait(req, rw))
59b735ae 5604 return IO_APOLL_ABORTED;
d7718a9d
JA
5605
5606 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5607 if (unlikely(!apoll))
59b735ae 5608 return IO_APOLL_ABORTED;
807abcb0 5609 apoll->double_poll = NULL;
d7718a9d 5610 req->apoll = apoll;
b2d9c3da 5611 req->flags |= REQ_F_POLLED;
d7718a9d 5612 ipt.pt._qproc = io_async_queue_proc;
48dcd38d 5613 io_req_set_refcount(req);
d7718a9d
JA
5614
5615 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5616 io_async_wake);
79ebeaee 5617 spin_unlock(&ctx->completion_lock);
41a5169c
HX
5618 if (ret || ipt.error)
5619 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5620
236daeae
OL
5621 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5622 mask, apoll->poll.events);
59b735ae 5623 return IO_APOLL_OK;
d7718a9d
JA
5624}
5625
5626static bool __io_poll_remove_one(struct io_kiocb *req,
b2e720ac 5627 struct io_poll_iocb *poll, bool do_cancel)
e07785b0 5628 __must_hold(&req->ctx->completion_lock)
221c5eb2 5629{
b41e9852 5630 bool do_complete = false;
221c5eb2 5631
5082620f
JA
5632 if (!poll->head)
5633 return false;
79ebeaee 5634 spin_lock_irq(&poll->head->lock);
b2e720ac
JA
5635 if (do_cancel)
5636 WRITE_ONCE(poll->canceled, true);
392edb45
JA
5637 if (!list_empty(&poll->wait.entry)) {
5638 list_del_init(&poll->wait.entry);
b41e9852 5639 do_complete = true;
221c5eb2 5640 }
79ebeaee 5641 spin_unlock_irq(&poll->head->lock);
3bfa5bcb 5642 hash_del(&req->hash_node);
d7718a9d
JA
5643 return do_complete;
5644}
5645
5d709043 5646static bool io_poll_remove_one(struct io_kiocb *req)
e07785b0 5647 __must_hold(&req->ctx->completion_lock)
d7718a9d
JA
5648{
5649 bool do_complete;
5650
d4e7cd36 5651 io_poll_remove_double(req);
e31001a3 5652 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
d4e7cd36 5653
b41e9852 5654 if (do_complete) {
d4d19c19 5655 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
b41e9852 5656 io_commit_cqring(req->ctx);
93d2bcd2 5657 req_set_fail(req);
91c2f697 5658 io_put_req_deferred(req);
5d709043 5659 }
b41e9852 5660 return do_complete;
221c5eb2
JA
5661}
5662
76e1b642
JA
5663/*
5664 * Returns true if we found and killed one or more poll requests
5665 */
c072481d
PB
5666static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
5667 struct task_struct *tsk, bool cancel_all)
221c5eb2 5668{
78076bb6 5669 struct hlist_node *tmp;
221c5eb2 5670 struct io_kiocb *req;
8e2e1faf 5671 int posted = 0, i;
221c5eb2 5672
79ebeaee 5673 spin_lock(&ctx->completion_lock);
78076bb6
JA
5674 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5675 struct hlist_head *list;
5676
5677 list = &ctx->cancel_hash[i];
f3606e3a 5678 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
3dd0c97a 5679 if (io_match_task(req, tsk, cancel_all))
f3606e3a
JA
5680 posted += io_poll_remove_one(req);
5681 }
221c5eb2 5682 }
79ebeaee 5683 spin_unlock(&ctx->completion_lock);
b41e9852 5684
8e2e1faf
JA
5685 if (posted)
5686 io_cqring_ev_posted(ctx);
76e1b642
JA
5687
5688 return posted != 0;
221c5eb2
JA
5689}
5690
9ba5fac8
PB
5691static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5692 bool poll_only)
e07785b0 5693 __must_hold(&ctx->completion_lock)
47f46768 5694{
78076bb6 5695 struct hlist_head *list;
47f46768
JA
5696 struct io_kiocb *req;
5697
78076bb6
JA
5698 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5699 hlist_for_each_entry(req, list, hash_node) {
b41e9852
JA
5700 if (sqe_addr != req->user_data)
5701 continue;
9ba5fac8
PB
5702 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5703 continue;
b2cb805f 5704 return req;
47f46768 5705 }
b2cb805f
JA
5706 return NULL;
5707}
5708
9ba5fac8
PB
5709static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5710 bool poll_only)
e07785b0 5711 __must_hold(&ctx->completion_lock)
b2cb805f
JA
5712{
5713 struct io_kiocb *req;
5714
9ba5fac8 5715 req = io_poll_find(ctx, sqe_addr, poll_only);
b2cb805f
JA
5716 if (!req)
5717 return -ENOENT;
5718 if (io_poll_remove_one(req))
5719 return 0;
5720
5721 return -EALREADY;
47f46768
JA
5722}
5723
9096af3e
PB
5724static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5725 unsigned int flags)
5726{
5727 u32 events;
47f46768 5728
9096af3e
PB
5729 events = READ_ONCE(sqe->poll32_events);
5730#ifdef __BIG_ENDIAN
5731 events = swahw32(events);
5732#endif
5733 if (!(flags & IORING_POLL_ADD_MULTI))
5734 events |= EPOLLONESHOT;
5735 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
47f46768
JA
5736}
5737
c5de0036 5738static int io_poll_update_prep(struct io_kiocb *req,
3529d8c2 5739 const struct io_uring_sqe *sqe)
0969e783 5740{
c5de0036
PB
5741 struct io_poll_update *upd = &req->poll_update;
5742 u32 flags;
5743
0969e783
JA
5744 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5745 return -EINVAL;
26578cda 5746 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
c5de0036
PB
5747 return -EINVAL;
5748 flags = READ_ONCE(sqe->len);
5749 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5750 IORING_POLL_ADD_MULTI))
5751 return -EINVAL;
5752 /* meaningless without update */
5753 if (flags == IORING_POLL_ADD_MULTI)
0969e783
JA
5754 return -EINVAL;
5755
c5de0036
PB
5756 upd->old_user_data = READ_ONCE(sqe->addr);
5757 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5758 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
221c5eb2 5759
c5de0036
PB
5760 upd->new_user_data = READ_ONCE(sqe->off);
5761 if (!upd->update_user_data && upd->new_user_data)
5762 return -EINVAL;
5763 if (upd->update_events)
5764 upd->events = io_poll_parse_events(sqe, flags);
5765 else if (sqe->poll32_events)
5766 return -EINVAL;
221c5eb2 5767
221c5eb2
JA
5768 return 0;
5769}
5770
221c5eb2
JA
5771static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5772 void *key)
5773{
c2f2eb7d
JA
5774 struct io_kiocb *req = wait->private;
5775 struct io_poll_iocb *poll = &req->poll;
221c5eb2 5776
d7718a9d 5777 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
221c5eb2
JA
5778}
5779
221c5eb2
JA
5780static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5781 struct poll_table_struct *p)
5782{
5783 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5784
e8c2bc1f 5785 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
eac406c6
JA
5786}
5787
3529d8c2 5788static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
5789{
5790 struct io_poll_iocb *poll = &req->poll;
c5de0036 5791 u32 flags;
221c5eb2
JA
5792
5793 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5794 return -EINVAL;
c5de0036 5795 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
88e41cf9
JA
5796 return -EINVAL;
5797 flags = READ_ONCE(sqe->len);
c5de0036 5798 if (flags & ~IORING_POLL_ADD_MULTI)
221c5eb2
JA
5799 return -EINVAL;
5800
48dcd38d 5801 io_req_set_refcount(req);
c5de0036 5802 poll->events = io_poll_parse_events(sqe, flags);
0969e783
JA
5803 return 0;
5804}
5805
61e98203 5806static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
5807{
5808 struct io_poll_iocb *poll = &req->poll;
5809 struct io_ring_ctx *ctx = req->ctx;
5810 struct io_poll_table ipt;
0969e783 5811 __poll_t mask;
5b7aa38d 5812 bool done;
0969e783 5813
d7718a9d 5814 ipt.pt._qproc = io_poll_queue_proc;
36703247 5815
d7718a9d
JA
5816 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5817 io_poll_wake);
221c5eb2 5818
8c838788 5819 if (mask) { /* no async, we'd stolen it */
221c5eb2 5820 ipt.error = 0;
eb6e6f06
PB
5821 done = __io_poll_complete(req, mask);
5822 io_commit_cqring(req->ctx);
221c5eb2 5823 }
79ebeaee 5824 spin_unlock(&ctx->completion_lock);
221c5eb2 5825
8c838788
JA
5826 if (mask) {
5827 io_cqring_ev_posted(ctx);
5b7aa38d 5828 if (done)
88e41cf9 5829 io_put_req(req);
221c5eb2 5830 }
8c838788 5831 return ipt.error;
221c5eb2
JA
5832}
5833
c5de0036 5834static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
b69de288
JA
5835{
5836 struct io_ring_ctx *ctx = req->ctx;
5837 struct io_kiocb *preq;
cb3b200e 5838 bool completing;
b69de288
JA
5839 int ret;
5840
79ebeaee 5841 spin_lock(&ctx->completion_lock);
9ba5fac8 5842 preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
b69de288
JA
5843 if (!preq) {
5844 ret = -ENOENT;
5845 goto err;
b69de288 5846 }
cb3b200e 5847
c5de0036
PB
5848 if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5849 completing = true;
5850 ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5851 goto err;
5852 }
5853
cb3b200e
JA
5854 /*
5855 * Don't allow racy completion with singleshot, as we cannot safely
5856 * update those. For multishot, if we're racing with completion, just
5857 * let completion re-add it.
5858 */
5859 completing = !__io_poll_remove_one(preq, &preq->poll, false);
5860 if (completing && (preq->poll.events & EPOLLONESHOT)) {
5861 ret = -EALREADY;
5862 goto err;
b69de288
JA
5863 }
5864 /* we now have a detached poll request. reissue. */
5865 ret = 0;
5866err:
b69de288 5867 if (ret < 0) {
79ebeaee 5868 spin_unlock(&ctx->completion_lock);
93d2bcd2 5869 req_set_fail(req);
b69de288
JA
5870 io_req_complete(req, ret);
5871 return 0;
5872 }
5873 /* only mask one event flags, keep behavior flags */
9d805892 5874 if (req->poll_update.update_events) {
b69de288 5875 preq->poll.events &= ~0xffff;
9d805892 5876 preq->poll.events |= req->poll_update.events & 0xffff;
b69de288
JA
5877 preq->poll.events |= IO_POLL_UNMASK;
5878 }
9d805892
PB
5879 if (req->poll_update.update_user_data)
5880 preq->user_data = req->poll_update.new_user_data;
79ebeaee 5881 spin_unlock(&ctx->completion_lock);
cb3b200e 5882
b69de288
JA
5883 /* complete update request, we're done with it */
5884 io_req_complete(req, ret);
5885
cb3b200e 5886 if (!completing) {
c5de0036 5887 ret = io_poll_add(preq, issue_flags);
cb3b200e 5888 if (ret < 0) {
93d2bcd2 5889 req_set_fail(preq);
cb3b200e
JA
5890 io_req_complete(preq, ret);
5891 }
b69de288
JA
5892 }
5893 return 0;
5894}
5895
f237c30a 5896static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
89850fce 5897{
6224590d
PB
5898 struct io_timeout_data *data = req->async_data;
5899
5900 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
5901 req_set_fail(req);
505657bc 5902 io_req_complete_post(req, -ETIME, 0);
89850fce
JA
5903}
5904
5262f567
JA
5905static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5906{
ad8a48ac
JA
5907 struct io_timeout_data *data = container_of(timer,
5908 struct io_timeout_data, timer);
5909 struct io_kiocb *req = data->req;
5910 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
5911 unsigned long flags;
5912
89850fce 5913 spin_lock_irqsave(&ctx->timeout_lock, flags);
a71976f3 5914 list_del_init(&req->timeout.list);
01cec8c1
PB
5915 atomic_set(&req->ctx->cq_timeouts,
5916 atomic_read(&req->ctx->cq_timeouts) + 1);
89850fce 5917 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
01cec8c1 5918
89850fce
JA
5919 req->io_task_work.func = io_req_task_timeout;
5920 io_req_task_work_add(req);
5262f567
JA
5921 return HRTIMER_NORESTART;
5922}
5923
fbd15848
PB
5924static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5925 __u64 user_data)
89850fce 5926 __must_hold(&ctx->timeout_lock)
f254ac04 5927{
fbd15848 5928 struct io_timeout_data *io;
47f46768 5929 struct io_kiocb *req;
fd9c7bc5 5930 bool found = false;
f254ac04 5931
135fcde8 5932 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
fd9c7bc5
PB
5933 found = user_data == req->user_data;
5934 if (found)
47f46768 5935 break;
47f46768 5936 }
fd9c7bc5
PB
5937 if (!found)
5938 return ERR_PTR(-ENOENT);
fbd15848
PB
5939
5940 io = req->async_data;
fd9c7bc5 5941 if (hrtimer_try_to_cancel(&io->timer) == -1)
fbd15848 5942 return ERR_PTR(-EALREADY);
a71976f3 5943 list_del_init(&req->timeout.list);
fbd15848
PB
5944 return req;
5945}
47f46768 5946
fbd15848 5947static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
ec3c3d0f 5948 __must_hold(&ctx->completion_lock)
89850fce 5949 __must_hold(&ctx->timeout_lock)
fbd15848
PB
5950{
5951 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5952
5953 if (IS_ERR(req))
5954 return PTR_ERR(req);
f254ac04 5955
93d2bcd2 5956 req_set_fail(req);
d4d19c19 5957 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
91c2f697 5958 io_put_req_deferred(req);
f254ac04
JA
5959 return 0;
5960}
5961
50c1df2b
JA
5962static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
5963{
5964 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
5965 case IORING_TIMEOUT_BOOTTIME:
5966 return CLOCK_BOOTTIME;
5967 case IORING_TIMEOUT_REALTIME:
5968 return CLOCK_REALTIME;
5969 default:
5970 /* can't happen, vetted at prep time */
5971 WARN_ON_ONCE(1);
5972 fallthrough;
5973 case 0:
5974 return CLOCK_MONOTONIC;
5975 }
5976}
5977
f1042b6c
PB
5978static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5979 struct timespec64 *ts, enum hrtimer_mode mode)
5980 __must_hold(&ctx->timeout_lock)
5981{
5982 struct io_timeout_data *io;
5983 struct io_kiocb *req;
5984 bool found = false;
5985
5986 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
5987 found = user_data == req->user_data;
5988 if (found)
5989 break;
5990 }
5991 if (!found)
5992 return -ENOENT;
5993
5994 io = req->async_data;
5995 if (hrtimer_try_to_cancel(&io->timer) == -1)
5996 return -EALREADY;
5997 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
5998 io->timer.function = io_link_timeout_fn;
5999 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6000 return 0;
6001}
6002
9c8e11b3
PB
6003static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6004 struct timespec64 *ts, enum hrtimer_mode mode)
89850fce 6005 __must_hold(&ctx->timeout_lock)
47f46768 6006{
9c8e11b3
PB
6007 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6008 struct io_timeout_data *data;
47f46768 6009
9c8e11b3
PB
6010 if (IS_ERR(req))
6011 return PTR_ERR(req);
47f46768 6012
9c8e11b3
PB
6013 req->timeout.off = 0; /* noseq */
6014 data = req->async_data;
6015 list_add_tail(&req->timeout.list, &ctx->timeout_list);
50c1df2b 6016 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
9c8e11b3
PB
6017 data->timer.function = io_timeout_fn;
6018 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6019 return 0;
47f46768
JA
6020}
6021
3529d8c2
JA
6022static int io_timeout_remove_prep(struct io_kiocb *req,
6023 const struct io_uring_sqe *sqe)
b29472ee 6024{
9c8e11b3
PB
6025 struct io_timeout_rem *tr = &req->timeout_rem;
6026
b29472ee
JA
6027 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6028 return -EINVAL;
61710e43
DA
6029 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6030 return -EINVAL;
26578cda 6031 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
b29472ee
JA
6032 return -EINVAL;
6033
f1042b6c 6034 tr->ltimeout = false;
9c8e11b3
PB
6035 tr->addr = READ_ONCE(sqe->addr);
6036 tr->flags = READ_ONCE(sqe->timeout_flags);
f1042b6c
PB
6037 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6038 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6039 return -EINVAL;
6040 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6041 tr->ltimeout = true;
6042 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
9c8e11b3
PB
6043 return -EINVAL;
6044 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6045 return -EFAULT;
6046 } else if (tr->flags) {
6047 /* timeout removal doesn't support flags */
b29472ee 6048 return -EINVAL;
9c8e11b3 6049 }
b29472ee 6050
b29472ee
JA
6051 return 0;
6052}
6053
8662daec
PB
6054static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6055{
6056 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6057 : HRTIMER_MODE_REL;
6058}
6059
11365043
JA
6060/*
6061 * Remove or update an existing timeout command
6062 */
61e98203 6063static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 6064{
9c8e11b3 6065 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 6066 struct io_ring_ctx *ctx = req->ctx;
47f46768 6067 int ret;
11365043 6068
ec3c3d0f
PB
6069 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6070 spin_lock(&ctx->completion_lock);
6071 spin_lock_irq(&ctx->timeout_lock);
9c8e11b3 6072 ret = io_timeout_cancel(ctx, tr->addr);
ec3c3d0f
PB
6073 spin_unlock_irq(&ctx->timeout_lock);
6074 spin_unlock(&ctx->completion_lock);
6075 } else {
f1042b6c
PB
6076 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6077
ec3c3d0f 6078 spin_lock_irq(&ctx->timeout_lock);
f1042b6c
PB
6079 if (tr->ltimeout)
6080 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6081 else
6082 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
ec3c3d0f
PB
6083 spin_unlock_irq(&ctx->timeout_lock);
6084 }
11365043 6085
4e88d6e7 6086 if (ret < 0)
93d2bcd2 6087 req_set_fail(req);
505657bc 6088 io_req_complete_post(req, ret, 0);
11365043 6089 return 0;
5262f567
JA
6090}
6091
3529d8c2 6092static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 6093 bool is_timeout_link)
5262f567 6094{
ad8a48ac 6095 struct io_timeout_data *data;
a41525ab 6096 unsigned flags;
56080b02 6097 u32 off = READ_ONCE(sqe->off);
5262f567 6098
ad8a48ac 6099 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 6100 return -EINVAL;
26578cda
PB
6101 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6102 sqe->splice_fd_in)
a41525ab 6103 return -EINVAL;
56080b02 6104 if (off && is_timeout_link)
2d28390a 6105 return -EINVAL;
a41525ab 6106 flags = READ_ONCE(sqe->timeout_flags);
6224590d
PB
6107 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6108 IORING_TIMEOUT_ETIME_SUCCESS))
50c1df2b
JA
6109 return -EINVAL;
6110 /* more than one clock specified is invalid, obviously */
6111 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5262f567 6112 return -EINVAL;
bdf20073 6113
ef9dd637 6114 INIT_LIST_HEAD(&req->timeout.list);
bfe68a22 6115 req->timeout.off = off;
f18ee4cf
PB
6116 if (unlikely(off && !req->ctx->off_timeout_used))
6117 req->ctx->off_timeout_used = true;
26a61679 6118
d886e185 6119 if (!req_has_async_data(req) && io_alloc_async_data(req))
26a61679
JA
6120 return -ENOMEM;
6121
e8c2bc1f 6122 data = req->async_data;
ad8a48ac 6123 data->req = req;
50c1df2b 6124 data->flags = flags;
ad8a48ac
JA
6125
6126 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
6127 return -EFAULT;
6128
8662daec 6129 data->mode = io_translate_timeout_mode(flags);
50c1df2b 6130 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
b97e736a
PB
6131
6132 if (is_timeout_link) {
6133 struct io_submit_link *link = &req->ctx->submit_state.link;
6134
6135 if (!link->head)
6136 return -EINVAL;
6137 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6138 return -EINVAL;
4d13d1a4
PB
6139 req->timeout.head = link->last;
6140 link->last->flags |= REQ_F_ARM_LTIMEOUT;
b97e736a 6141 }
ad8a48ac
JA
6142 return 0;
6143}
6144
61e98203 6145static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 6146{
ad8a48ac 6147 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 6148 struct io_timeout_data *data = req->async_data;
ad8a48ac 6149 struct list_head *entry;
bfe68a22 6150 u32 tail, off = req->timeout.off;
ad8a48ac 6151
89850fce 6152 spin_lock_irq(&ctx->timeout_lock);
93bd25bb 6153
5262f567
JA
6154 /*
6155 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
6156 * timeout event to be satisfied. If it isn't set, then this is
6157 * a pure timeout request, sequence isn't used.
5262f567 6158 */
8eb7e2d0 6159 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
6160 entry = ctx->timeout_list.prev;
6161 goto add;
6162 }
5262f567 6163
bfe68a22
PB
6164 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6165 req->timeout.target_seq = tail + off;
5262f567 6166
f010505b
MDG
6167 /* Update the last seq here in case io_flush_timeouts() hasn't.
6168 * This is safe because ->completion_lock is held, and submissions
6169 * and completions are never mixed in the same ->completion_lock section.
6170 */
6171 ctx->cq_last_tm_flush = tail;
6172
5262f567
JA
6173 /*
6174 * Insertion sort, ensuring the first entry in the list is always
6175 * the one we need first.
6176 */
5262f567 6177 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
6178 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6179 timeout.list);
5262f567 6180
8eb7e2d0 6181 if (io_is_timeout_noseq(nxt))
93bd25bb 6182 continue;
bfe68a22
PB
6183 /* nxt.seq is behind @tail, otherwise would've been completed */
6184 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
6185 break;
6186 }
93bd25bb 6187add:
135fcde8 6188 list_add(&req->timeout.list, entry);
ad8a48ac
JA
6189 data->timer.function = io_timeout_fn;
6190 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
89850fce 6191 spin_unlock_irq(&ctx->timeout_lock);
5262f567
JA
6192 return 0;
6193}
5262f567 6194
f458dd84
PB
6195struct io_cancel_data {
6196 struct io_ring_ctx *ctx;
6197 u64 user_data;
6198};
6199
62755e35
JA
6200static bool io_cancel_cb(struct io_wq_work *work, void *data)
6201{
6202 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 6203 struct io_cancel_data *cd = data;
62755e35 6204
f458dd84 6205 return req->ctx == cd->ctx && req->user_data == cd->user_data;
62755e35
JA
6206}
6207
f458dd84
PB
6208static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6209 struct io_ring_ctx *ctx)
62755e35 6210{
f458dd84 6211 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
62755e35 6212 enum io_wq_cancel cancel_ret;
62755e35
JA
6213 int ret = 0;
6214
f458dd84 6215 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
6216 return -ENOENT;
6217
f458dd84 6218 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
62755e35
JA
6219 switch (cancel_ret) {
6220 case IO_WQ_CANCEL_OK:
6221 ret = 0;
6222 break;
6223 case IO_WQ_CANCEL_RUNNING:
6224 ret = -EALREADY;
6225 break;
6226 case IO_WQ_CANCEL_NOTFOUND:
6227 ret = -ENOENT;
6228 break;
6229 }
6230
e977d6d3
JA
6231 return ret;
6232}
6233
8cb01fac 6234static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
47f46768 6235{
8cb01fac 6236 struct io_ring_ctx *ctx = req->ctx;
47f46768
JA
6237 int ret;
6238
dadebc35 6239 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
8cb01fac 6240
f458dd84 6241 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
df9727af 6242 if (ret != -ENOENT)
8cb01fac 6243 return ret;
505657bc
PB
6244
6245 spin_lock(&ctx->completion_lock);
79ebeaee 6246 spin_lock_irq(&ctx->timeout_lock);
47f46768 6247 ret = io_timeout_cancel(ctx, sqe_addr);
79ebeaee 6248 spin_unlock_irq(&ctx->timeout_lock);
47f46768 6249 if (ret != -ENOENT)
505657bc
PB
6250 goto out;
6251 ret = io_poll_cancel(ctx, sqe_addr, false);
6252out:
6253 spin_unlock(&ctx->completion_lock);
6254 return ret;
47f46768
JA
6255}
6256
3529d8c2
JA
6257static int io_async_cancel_prep(struct io_kiocb *req,
6258 const struct io_uring_sqe *sqe)
e977d6d3 6259{
fbf23849 6260 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 6261 return -EINVAL;
61710e43
DA
6262 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6263 return -EINVAL;
26578cda
PB
6264 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6265 sqe->splice_fd_in)
e977d6d3
JA
6266 return -EINVAL;
6267
fbf23849
JA
6268 req->cancel.addr = READ_ONCE(sqe->addr);
6269 return 0;
6270}
6271
61e98203 6272static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6273{
6274 struct io_ring_ctx *ctx = req->ctx;
58f99373
PB
6275 u64 sqe_addr = req->cancel.addr;
6276 struct io_tctx_node *node;
6277 int ret;
6278
8cb01fac 6279 ret = io_try_cancel_userdata(req, sqe_addr);
58f99373
PB
6280 if (ret != -ENOENT)
6281 goto done;
58f99373
PB
6282
6283 /* slow path, try all io-wq's */
6284 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6285 ret = -ENOENT;
6286 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6287 struct io_uring_task *tctx = node->task->io_uring;
fbf23849 6288
58f99373
PB
6289 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6290 if (ret != -ENOENT)
6291 break;
6292 }
6293 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
58f99373 6294done:
58f99373 6295 if (ret < 0)
93d2bcd2 6296 req_set_fail(req);
505657bc 6297 io_req_complete_post(req, ret, 0);
5262f567
JA
6298 return 0;
6299}
6300
269bbe5f 6301static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
6302 const struct io_uring_sqe *sqe)
6303{
61710e43
DA
6304 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6305 return -EINVAL;
26578cda 6306 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
05f3fb3c
JA
6307 return -EINVAL;
6308
269bbe5f
BM
6309 req->rsrc_update.offset = READ_ONCE(sqe->off);
6310 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6311 if (!req->rsrc_update.nr_args)
05f3fb3c 6312 return -EINVAL;
269bbe5f 6313 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
6314 return 0;
6315}
6316
889fca73 6317static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6318{
6319 struct io_ring_ctx *ctx = req->ctx;
c3bdad02 6320 struct io_uring_rsrc_update2 up;
05f3fb3c 6321 int ret;
fbf23849 6322
269bbe5f
BM
6323 up.offset = req->rsrc_update.offset;
6324 up.data = req->rsrc_update.arg;
c3bdad02
PB
6325 up.nr = 0;
6326 up.tags = 0;
615cee49 6327 up.resv = 0;
05f3fb3c 6328
cdb31c29 6329 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
fdecb662 6330 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
98f0b3b4 6331 &up, req->rsrc_update.nr_args);
cdb31c29 6332 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
05f3fb3c
JA
6333
6334 if (ret < 0)
93d2bcd2 6335 req_set_fail(req);
889fca73 6336 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
6337 return 0;
6338}
6339
bfe76559 6340static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 6341{
d625c6ee 6342 switch (req->opcode) {
e781573e 6343 case IORING_OP_NOP:
bfe76559 6344 return 0;
f67676d1
JA
6345 case IORING_OP_READV:
6346 case IORING_OP_READ_FIXED:
3a6820f2 6347 case IORING_OP_READ:
bfe76559 6348 return io_read_prep(req, sqe);
f67676d1
JA
6349 case IORING_OP_WRITEV:
6350 case IORING_OP_WRITE_FIXED:
3a6820f2 6351 case IORING_OP_WRITE:
bfe76559 6352 return io_write_prep(req, sqe);
0969e783 6353 case IORING_OP_POLL_ADD:
bfe76559 6354 return io_poll_add_prep(req, sqe);
0969e783 6355 case IORING_OP_POLL_REMOVE:
c5de0036 6356 return io_poll_update_prep(req, sqe);
8ed8d3c3 6357 case IORING_OP_FSYNC:
1155c76a 6358 return io_fsync_prep(req, sqe);
8ed8d3c3 6359 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 6360 return io_sfr_prep(req, sqe);
03b1230c 6361 case IORING_OP_SENDMSG:
fddaface 6362 case IORING_OP_SEND:
bfe76559 6363 return io_sendmsg_prep(req, sqe);
03b1230c 6364 case IORING_OP_RECVMSG:
fddaface 6365 case IORING_OP_RECV:
bfe76559 6366 return io_recvmsg_prep(req, sqe);
f499a021 6367 case IORING_OP_CONNECT:
bfe76559 6368 return io_connect_prep(req, sqe);
2d28390a 6369 case IORING_OP_TIMEOUT:
bfe76559 6370 return io_timeout_prep(req, sqe, false);
b29472ee 6371 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 6372 return io_timeout_remove_prep(req, sqe);
fbf23849 6373 case IORING_OP_ASYNC_CANCEL:
bfe76559 6374 return io_async_cancel_prep(req, sqe);
2d28390a 6375 case IORING_OP_LINK_TIMEOUT:
bfe76559 6376 return io_timeout_prep(req, sqe, true);
8ed8d3c3 6377 case IORING_OP_ACCEPT:
bfe76559 6378 return io_accept_prep(req, sqe);
d63d1b5e 6379 case IORING_OP_FALLOCATE:
bfe76559 6380 return io_fallocate_prep(req, sqe);
15b71abe 6381 case IORING_OP_OPENAT:
bfe76559 6382 return io_openat_prep(req, sqe);
b5dba59e 6383 case IORING_OP_CLOSE:
bfe76559 6384 return io_close_prep(req, sqe);
05f3fb3c 6385 case IORING_OP_FILES_UPDATE:
269bbe5f 6386 return io_rsrc_update_prep(req, sqe);
eddc7ef5 6387 case IORING_OP_STATX:
bfe76559 6388 return io_statx_prep(req, sqe);
4840e418 6389 case IORING_OP_FADVISE:
bfe76559 6390 return io_fadvise_prep(req, sqe);
c1ca757b 6391 case IORING_OP_MADVISE:
bfe76559 6392 return io_madvise_prep(req, sqe);
cebdb986 6393 case IORING_OP_OPENAT2:
bfe76559 6394 return io_openat2_prep(req, sqe);
3e4827b0 6395 case IORING_OP_EPOLL_CTL:
bfe76559 6396 return io_epoll_ctl_prep(req, sqe);
7d67af2c 6397 case IORING_OP_SPLICE:
bfe76559 6398 return io_splice_prep(req, sqe);
ddf0322d 6399 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 6400 return io_provide_buffers_prep(req, sqe);
067524e9 6401 case IORING_OP_REMOVE_BUFFERS:
bfe76559 6402 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 6403 case IORING_OP_TEE:
bfe76559 6404 return io_tee_prep(req, sqe);
36f4fa68
JA
6405 case IORING_OP_SHUTDOWN:
6406 return io_shutdown_prep(req, sqe);
80a261fd
JA
6407 case IORING_OP_RENAMEAT:
6408 return io_renameat_prep(req, sqe);
14a1143b
JA
6409 case IORING_OP_UNLINKAT:
6410 return io_unlinkat_prep(req, sqe);
e34a02dc
DK
6411 case IORING_OP_MKDIRAT:
6412 return io_mkdirat_prep(req, sqe);
7a8721f8
DK
6413 case IORING_OP_SYMLINKAT:
6414 return io_symlinkat_prep(req, sqe);
cf30da90
DK
6415 case IORING_OP_LINKAT:
6416 return io_linkat_prep(req, sqe);
f67676d1
JA
6417 }
6418
bfe76559
PB
6419 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6420 req->opcode);
bd54b6fe 6421 return -EINVAL;
bfe76559
PB
6422}
6423
93642ef8 6424static int io_req_prep_async(struct io_kiocb *req)
bfe76559 6425{
b7e298d2
PB
6426 if (!io_op_defs[req->opcode].needs_async_setup)
6427 return 0;
d886e185 6428 if (WARN_ON_ONCE(req_has_async_data(req)))
b7e298d2
PB
6429 return -EFAULT;
6430 if (io_alloc_async_data(req))
6431 return -EAGAIN;
6432
93642ef8
PB
6433 switch (req->opcode) {
6434 case IORING_OP_READV:
93642ef8
PB
6435 return io_rw_prep_async(req, READ);
6436 case IORING_OP_WRITEV:
93642ef8
PB
6437 return io_rw_prep_async(req, WRITE);
6438 case IORING_OP_SENDMSG:
93642ef8
PB
6439 return io_sendmsg_prep_async(req);
6440 case IORING_OP_RECVMSG:
93642ef8
PB
6441 return io_recvmsg_prep_async(req);
6442 case IORING_OP_CONNECT:
6443 return io_connect_prep_async(req);
6444 }
b7e298d2
PB
6445 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6446 req->opcode);
6447 return -EFAULT;
f67676d1
JA
6448}
6449
9cf7c104
PB
6450static u32 io_get_sequence(struct io_kiocb *req)
6451{
a3dbdf54 6452 u32 seq = req->ctx->cached_sq_head;
9cf7c104 6453
a3dbdf54
PB
6454 /* need original cached_sq_head, but it was increased for each req */
6455 io_for_each_link(req, req)
6456 seq--;
6457 return seq;
9cf7c104
PB
6458}
6459
c072481d 6460static __cold void io_drain_req(struct io_kiocb *req)
de0617e4 6461{
a197f664 6462 struct io_ring_ctx *ctx = req->ctx;
27dc8338 6463 struct io_defer_entry *de;
f67676d1 6464 int ret;
e0eb71dc 6465 u32 seq = io_get_sequence(req);
de0617e4 6466
9d858b21 6467 /* Still need defer if there is pending req in defer list. */
5e371265 6468 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
e0eb71dc 6469queue:
5e371265 6470 ctx->drain_active = false;
e0eb71dc
PB
6471 io_req_task_queue(req);
6472 return;
5e371265 6473 }
de0617e4 6474
b7e298d2 6475 ret = io_req_prep_async(req);
e0eb71dc
PB
6476 if (ret) {
6477fail:
6478 io_req_complete_failed(req, ret);
6479 return;
6480 }
cbdcb435 6481 io_prep_async_link(req);
27dc8338 6482 de = kmalloc(sizeof(*de), GFP_KERNEL);
76cc33d7 6483 if (!de) {
1b48773f 6484 ret = -ENOMEM;
e0eb71dc 6485 goto fail;
76cc33d7 6486 }
2d28390a 6487
79ebeaee 6488 spin_lock(&ctx->completion_lock);
9cf7c104 6489 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
79ebeaee 6490 spin_unlock(&ctx->completion_lock);
27dc8338 6491 kfree(de);
e0eb71dc 6492 goto queue;
de0617e4
JA
6493 }
6494
915967f6 6495 trace_io_uring_defer(ctx, req, req->user_data);
27dc8338 6496 de->req = req;
9cf7c104 6497 de->seq = seq;
27dc8338 6498 list_add_tail(&de->list, &ctx->defer_list);
79ebeaee 6499 spin_unlock(&ctx->completion_lock);
de0617e4
JA
6500}
6501
68fb8979 6502static void io_clean_op(struct io_kiocb *req)
99bc4c38 6503{
0e1b6fe3 6504 if (req->flags & REQ_F_BUFFER_SELECTED) {
30d51dd4
PB
6505 kfree(req->kbuf);
6506 req->kbuf = NULL;
99bc4c38
PB
6507 }
6508
0e1b6fe3
PB
6509 if (req->flags & REQ_F_NEED_CLEANUP) {
6510 switch (req->opcode) {
6511 case IORING_OP_READV:
6512 case IORING_OP_READ_FIXED:
6513 case IORING_OP_READ:
6514 case IORING_OP_WRITEV:
6515 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
6516 case IORING_OP_WRITE: {
6517 struct io_async_rw *io = req->async_data;
1dacb4df
PB
6518
6519 kfree(io->free_iovec);
0e1b6fe3 6520 break;
e8c2bc1f 6521 }
0e1b6fe3 6522 case IORING_OP_RECVMSG:
e8c2bc1f
JA
6523 case IORING_OP_SENDMSG: {
6524 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
6525
6526 kfree(io->free_iov);
0e1b6fe3 6527 break;
e8c2bc1f 6528 }
0e1b6fe3
PB
6529 case IORING_OP_SPLICE:
6530 case IORING_OP_TEE:
e1d767f0
PB
6531 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6532 io_put_file(req->splice.file_in);
0e1b6fe3 6533 break;
f3cd4850
JA
6534 case IORING_OP_OPENAT:
6535 case IORING_OP_OPENAT2:
6536 if (req->open.filename)
6537 putname(req->open.filename);
6538 break;
80a261fd
JA
6539 case IORING_OP_RENAMEAT:
6540 putname(req->rename.oldpath);
6541 putname(req->rename.newpath);
6542 break;
14a1143b
JA
6543 case IORING_OP_UNLINKAT:
6544 putname(req->unlink.filename);
6545 break;
e34a02dc
DK
6546 case IORING_OP_MKDIRAT:
6547 putname(req->mkdir.filename);
6548 break;
7a8721f8
DK
6549 case IORING_OP_SYMLINKAT:
6550 putname(req->symlink.oldpath);
6551 putname(req->symlink.newpath);
6552 break;
cf30da90
DK
6553 case IORING_OP_LINKAT:
6554 putname(req->hardlink.oldpath);
6555 putname(req->hardlink.newpath);
6556 break;
0e1b6fe3 6557 }
99bc4c38 6558 }
75652a30
JA
6559 if ((req->flags & REQ_F_POLLED) && req->apoll) {
6560 kfree(req->apoll->double_poll);
6561 kfree(req->apoll);
6562 req->apoll = NULL;
6563 }
3a0a6902
PB
6564 if (req->flags & REQ_F_INFLIGHT) {
6565 struct io_uring_task *tctx = req->task->io_uring;
6566
6567 atomic_dec(&tctx->inflight_tracked);
3a0a6902 6568 }
c854357b 6569 if (req->flags & REQ_F_CREDS)
b8e64b53 6570 put_cred(req->creds);
d886e185
PB
6571 if (req->flags & REQ_F_ASYNC_DATA) {
6572 kfree(req->async_data);
6573 req->async_data = NULL;
6574 }
c854357b 6575 req->flags &= ~IO_REQ_CLEAN_FLAGS;
99bc4c38
PB
6576}
6577
889fca73 6578static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 6579{
a197f664 6580 struct io_ring_ctx *ctx = req->ctx;
5730b27e 6581 const struct cred *creds = NULL;
d625c6ee 6582 int ret;
2b188cc1 6583
6878b40e 6584 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
c10d1f98 6585 creds = override_creds(req->creds);
5730b27e 6586
d625c6ee 6587 switch (req->opcode) {
2b188cc1 6588 case IORING_OP_NOP:
889fca73 6589 ret = io_nop(req, issue_flags);
2b188cc1
JA
6590 break;
6591 case IORING_OP_READV:
edafccee 6592 case IORING_OP_READ_FIXED:
3a6820f2 6593 case IORING_OP_READ:
889fca73 6594 ret = io_read(req, issue_flags);
edafccee 6595 break;
3529d8c2 6596 case IORING_OP_WRITEV:
edafccee 6597 case IORING_OP_WRITE_FIXED:
3a6820f2 6598 case IORING_OP_WRITE:
889fca73 6599 ret = io_write(req, issue_flags);
2b188cc1 6600 break;
c992fe29 6601 case IORING_OP_FSYNC:
45d189c6 6602 ret = io_fsync(req, issue_flags);
c992fe29 6603 break;
221c5eb2 6604 case IORING_OP_POLL_ADD:
61e98203 6605 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
6606 break;
6607 case IORING_OP_POLL_REMOVE:
c5de0036 6608 ret = io_poll_update(req, issue_flags);
221c5eb2 6609 break;
5d17b4a4 6610 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 6611 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 6612 break;
0fa03c62 6613 case IORING_OP_SENDMSG:
889fca73 6614 ret = io_sendmsg(req, issue_flags);
062d04d7 6615 break;
fddaface 6616 case IORING_OP_SEND:
889fca73 6617 ret = io_send(req, issue_flags);
0fa03c62 6618 break;
aa1fa28f 6619 case IORING_OP_RECVMSG:
889fca73 6620 ret = io_recvmsg(req, issue_flags);
062d04d7 6621 break;
fddaface 6622 case IORING_OP_RECV:
889fca73 6623 ret = io_recv(req, issue_flags);
aa1fa28f 6624 break;
5262f567 6625 case IORING_OP_TIMEOUT:
61e98203 6626 ret = io_timeout(req, issue_flags);
5262f567 6627 break;
11365043 6628 case IORING_OP_TIMEOUT_REMOVE:
61e98203 6629 ret = io_timeout_remove(req, issue_flags);
11365043 6630 break;
17f2fe35 6631 case IORING_OP_ACCEPT:
889fca73 6632 ret = io_accept(req, issue_flags);
17f2fe35 6633 break;
f8e85cf2 6634 case IORING_OP_CONNECT:
889fca73 6635 ret = io_connect(req, issue_flags);
f8e85cf2 6636 break;
62755e35 6637 case IORING_OP_ASYNC_CANCEL:
61e98203 6638 ret = io_async_cancel(req, issue_flags);
62755e35 6639 break;
d63d1b5e 6640 case IORING_OP_FALLOCATE:
45d189c6 6641 ret = io_fallocate(req, issue_flags);
d63d1b5e 6642 break;
15b71abe 6643 case IORING_OP_OPENAT:
45d189c6 6644 ret = io_openat(req, issue_flags);
15b71abe 6645 break;
b5dba59e 6646 case IORING_OP_CLOSE:
889fca73 6647 ret = io_close(req, issue_flags);
b5dba59e 6648 break;
05f3fb3c 6649 case IORING_OP_FILES_UPDATE:
889fca73 6650 ret = io_files_update(req, issue_flags);
05f3fb3c 6651 break;
eddc7ef5 6652 case IORING_OP_STATX:
45d189c6 6653 ret = io_statx(req, issue_flags);
eddc7ef5 6654 break;
4840e418 6655 case IORING_OP_FADVISE:
45d189c6 6656 ret = io_fadvise(req, issue_flags);
4840e418 6657 break;
c1ca757b 6658 case IORING_OP_MADVISE:
45d189c6 6659 ret = io_madvise(req, issue_flags);
c1ca757b 6660 break;
cebdb986 6661 case IORING_OP_OPENAT2:
45d189c6 6662 ret = io_openat2(req, issue_flags);
cebdb986 6663 break;
3e4827b0 6664 case IORING_OP_EPOLL_CTL:
889fca73 6665 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 6666 break;
7d67af2c 6667 case IORING_OP_SPLICE:
45d189c6 6668 ret = io_splice(req, issue_flags);
7d67af2c 6669 break;
ddf0322d 6670 case IORING_OP_PROVIDE_BUFFERS:
889fca73 6671 ret = io_provide_buffers(req, issue_flags);
ddf0322d 6672 break;
067524e9 6673 case IORING_OP_REMOVE_BUFFERS:
889fca73 6674 ret = io_remove_buffers(req, issue_flags);
3e4827b0 6675 break;
f2a8d5c7 6676 case IORING_OP_TEE:
45d189c6 6677 ret = io_tee(req, issue_flags);
f2a8d5c7 6678 break;
36f4fa68 6679 case IORING_OP_SHUTDOWN:
45d189c6 6680 ret = io_shutdown(req, issue_flags);
36f4fa68 6681 break;
80a261fd 6682 case IORING_OP_RENAMEAT:
45d189c6 6683 ret = io_renameat(req, issue_flags);
80a261fd 6684 break;
14a1143b 6685 case IORING_OP_UNLINKAT:
45d189c6 6686 ret = io_unlinkat(req, issue_flags);
14a1143b 6687 break;
e34a02dc
DK
6688 case IORING_OP_MKDIRAT:
6689 ret = io_mkdirat(req, issue_flags);
6690 break;
7a8721f8
DK
6691 case IORING_OP_SYMLINKAT:
6692 ret = io_symlinkat(req, issue_flags);
6693 break;
cf30da90
DK
6694 case IORING_OP_LINKAT:
6695 ret = io_linkat(req, issue_flags);
6696 break;
2b188cc1
JA
6697 default:
6698 ret = -EINVAL;
6699 break;
6700 }
6701
5730b27e
JA
6702 if (creds)
6703 revert_creds(creds);
def596e9
JA
6704 if (ret)
6705 return ret;
b532576e 6706 /* If the op doesn't have a file, we're not polling for it */
cb3d8972
PB
6707 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6708 io_iopoll_req_issued(req);
def596e9
JA
6709
6710 return 0;
2b188cc1
JA
6711}
6712
ebc11b6c
PB
6713static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6714{
6715 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6716
6717 req = io_put_req_find_next(req);
6718 return req ? &req->work : NULL;
6719}
6720
5280f7e5 6721static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
6722{
6723 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6df1db6b 6724 struct io_kiocb *timeout;
561fb04a 6725 int ret = 0;
2b188cc1 6726
48dcd38d
PB
6727 /* one will be dropped by ->io_free_work() after returning to io-wq */
6728 if (!(req->flags & REQ_F_REFCOUNT))
6729 __io_req_set_refcount(req, 2);
6730 else
6731 req_ref_get(req);
5d5901a3 6732
6df1db6b
PB
6733 timeout = io_prep_linked_timeout(req);
6734 if (timeout)
6735 io_queue_linked_timeout(timeout);
d4c81f38 6736
dadebc35 6737 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
4014d943 6738 if (work->flags & IO_WQ_WORK_CANCEL)
561fb04a 6739 ret = -ECANCELED;
31b51510 6740
561fb04a 6741 if (!ret) {
561fb04a 6742 do {
889fca73 6743 ret = io_issue_sqe(req, 0);
561fb04a
JA
6744 /*
6745 * We can get EAGAIN for polled IO even though we're
6746 * forcing a sync submission from here, since we can't
6747 * wait for request slots on the block side.
6748 */
6749 if (ret != -EAGAIN)
6750 break;
6751 cond_resched();
6752 } while (1);
6753 }
31b51510 6754
a3df7698 6755 /* avoid locking problems by failing it from a clean context */
5d5901a3 6756 if (ret)
a3df7698 6757 io_req_task_queue_fail(req, ret);
2b188cc1
JA
6758}
6759
aeca241b 6760static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
042b0d85 6761 unsigned i)
65e19f54 6762{
042b0d85 6763 return &table->files[i];
dafecf19
PB
6764}
6765
65e19f54
JA
6766static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6767 int index)
6768{
aeca241b 6769 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
65e19f54 6770
a04b0ac0 6771 return (struct file *) (slot->file_ptr & FFS_MASK);
65e19f54
JA
6772}
6773
a04b0ac0 6774static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
9a321c98
PB
6775{
6776 unsigned long file_ptr = (unsigned long) file;
6777
b191e2df 6778 if (__io_file_supports_nowait(file, READ))
9a321c98 6779 file_ptr |= FFS_ASYNC_READ;
b191e2df 6780 if (__io_file_supports_nowait(file, WRITE))
9a321c98
PB
6781 file_ptr |= FFS_ASYNC_WRITE;
6782 if (S_ISREG(file_inode(file)->i_mode))
6783 file_ptr |= FFS_ISREG;
a04b0ac0 6784 file_slot->file_ptr = file_ptr;
65e19f54
JA
6785}
6786
ac177053
PB
6787static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6788 struct io_kiocb *req, int fd)
09bb8394 6789{
8da11c19 6790 struct file *file;
ac177053 6791 unsigned long file_ptr;
09bb8394 6792
ac177053
PB
6793 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6794 return NULL;
6795 fd = array_index_nospec(fd, ctx->nr_user_files);
6796 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6797 file = (struct file *) (file_ptr & FFS_MASK);
6798 file_ptr &= ~FFS_MASK;
6799 /* mask in overlapping REQ_F and FFS bits */
b191e2df 6800 req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
a46be971 6801 io_req_set_rsrc_node(req, ctx);
ac177053
PB
6802 return file;
6803}
d44f554e 6804
ac177053 6805static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
ac177053
PB
6806 struct io_kiocb *req, int fd)
6807{
62906e89 6808 struct file *file = fget(fd);
ac177053
PB
6809
6810 trace_io_uring_file_get(ctx, fd);
09bb8394 6811
ac177053
PB
6812 /* we don't allow fixed io_uring files */
6813 if (file && unlikely(file->f_op == &io_uring_fops))
6814 io_req_track_inflight(req);
8371adf5 6815 return file;
09bb8394
JA
6816}
6817
ac177053 6818static inline struct file *io_file_get(struct io_ring_ctx *ctx,
ac177053
PB
6819 struct io_kiocb *req, int fd, bool fixed)
6820{
6821 if (fixed)
6822 return io_file_get_fixed(ctx, req, fd);
6823 else
62906e89 6824 return io_file_get_normal(ctx, req, fd);
ac177053
PB
6825}
6826
f237c30a 6827static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
89b263f6
JA
6828{
6829 struct io_kiocb *prev = req->timeout.prev;
8cb01fac 6830 int ret;
89b263f6
JA
6831
6832 if (prev) {
8cb01fac 6833 ret = io_try_cancel_userdata(req, prev->user_data);
505657bc 6834 io_req_complete_post(req, ret ?: -ETIME, 0);
89b263f6 6835 io_put_req(prev);
89b263f6
JA
6836 } else {
6837 io_req_complete_post(req, -ETIME, 0);
6838 }
6839}
6840
2665abfd 6841static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 6842{
ad8a48ac
JA
6843 struct io_timeout_data *data = container_of(timer,
6844 struct io_timeout_data, timer);
90cd7e42 6845 struct io_kiocb *prev, *req = data->req;
2665abfd 6846 struct io_ring_ctx *ctx = req->ctx;
2665abfd 6847 unsigned long flags;
2665abfd 6848
89b263f6 6849 spin_lock_irqsave(&ctx->timeout_lock, flags);
90cd7e42
PB
6850 prev = req->timeout.head;
6851 req->timeout.head = NULL;
2665abfd
JA
6852
6853 /*
6854 * We don't expect the list to be empty, that will only happen if we
6855 * race with the completion of the linked work.
6856 */
447c19f3 6857 if (prev) {
f2f87370 6858 io_remove_next_linked(prev);
447c19f3
PB
6859 if (!req_ref_inc_not_zero(prev))
6860 prev = NULL;
6861 }
ef9dd637 6862 list_del(&req->timeout.list);
89b263f6
JA
6863 req->timeout.prev = prev;
6864 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
2665abfd 6865
89b263f6
JA
6866 req->io_task_work.func = io_req_task_link_timeout;
6867 io_req_task_work_add(req);
2665abfd
JA
6868 return HRTIMER_NORESTART;
6869}
6870
de968c18 6871static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 6872{
de968c18
PB
6873 struct io_ring_ctx *ctx = req->ctx;
6874
89b263f6 6875 spin_lock_irq(&ctx->timeout_lock);
76a46e06 6876 /*
f2f87370
PB
6877 * If the back reference is NULL, then our linked request finished
6878 * before we got a chance to setup the timer
76a46e06 6879 */
90cd7e42 6880 if (req->timeout.head) {
e8c2bc1f 6881 struct io_timeout_data *data = req->async_data;
94ae5e77 6882
ad8a48ac
JA
6883 data->timer.function = io_link_timeout_fn;
6884 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6885 data->mode);
ef9dd637 6886 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
2665abfd 6887 }
89b263f6 6888 spin_unlock_irq(&ctx->timeout_lock);
2665abfd 6889 /* drop submission reference */
76a46e06
JA
6890 io_put_req(req);
6891}
2665abfd 6892
d475a9a6
PB
6893static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
6894 __must_hold(&req->ctx->uring_lock)
6895{
6896 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6897
6898 switch (io_arm_poll_handler(req)) {
6899 case IO_APOLL_READY:
6900 if (linked_timeout) {
6901 io_unprep_linked_timeout(req);
6902 linked_timeout = NULL;
6903 }
6904 io_req_task_queue(req);
6905 break;
6906 case IO_APOLL_ABORTED:
6907 /*
6908 * Queued up for async execution, worker will release
6909 * submit reference when the iocb is actually submitted.
6910 */
6911 io_queue_async_work(req, NULL);
6912 break;
6913 }
6914
6915 if (linked_timeout)
6916 io_queue_linked_timeout(linked_timeout);
6917}
6918
6919static inline void __io_queue_sqe(struct io_kiocb *req)
282cdc86 6920 __must_hold(&req->ctx->uring_lock)
2b188cc1 6921{
906c6caa 6922 struct io_kiocb *linked_timeout;
e0c5c576 6923 int ret;
2b188cc1 6924
c5eef2b9 6925 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 6926
fff4e40e
PB
6927 if (req->flags & REQ_F_COMPLETE_INLINE) {
6928 io_req_add_compl_list(req);
d9f9d284 6929 return;
fff4e40e 6930 }
491381ce
JA
6931 /*
6932 * We async punt it if the file wasn't marked NOWAIT, or if the file
6933 * doesn't support non-blocking read/write attempts
6934 */
1840038e 6935 if (likely(!ret)) {
906c6caa
PB
6936 linked_timeout = io_prep_linked_timeout(req);
6937 if (linked_timeout)
6938 io_queue_linked_timeout(linked_timeout);
1840038e 6939 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
d475a9a6 6940 io_queue_sqe_arm_apoll(req);
0d63c148 6941 } else {
f41db273 6942 io_req_complete_failed(req, ret);
9e645e11 6943 }
2b188cc1
JA
6944}
6945
4652fe3f 6946static void io_queue_sqe_fallback(struct io_kiocb *req)
282cdc86 6947 __must_hold(&req->ctx->uring_lock)
4fe2c963 6948{
4652fe3f 6949 if (req->flags & REQ_F_FAIL) {
c6d3d9cb 6950 io_req_complete_fail_submit(req);
e0eb71dc
PB
6951 } else if (unlikely(req->ctx->drain_active)) {
6952 io_drain_req(req);
76cc33d7
PB
6953 } else {
6954 int ret = io_req_prep_async(req);
6955
6956 if (unlikely(ret))
6957 io_req_complete_failed(req, ret);
6958 else
f237c30a 6959 io_queue_async_work(req, NULL);
ce35a47a 6960 }
4fe2c963
JL
6961}
6962
4652fe3f
PB
6963static inline void io_queue_sqe(struct io_kiocb *req)
6964 __must_hold(&req->ctx->uring_lock)
6965{
6966 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
6967 __io_queue_sqe(req);
6968 else
6969 io_queue_sqe_fallback(req);
6970}
6971
b16fed66
PB
6972/*
6973 * Check SQE restrictions (opcode and flags).
6974 *
6975 * Returns 'true' if SQE is allowed, 'false' otherwise.
6976 */
6977static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6978 struct io_kiocb *req,
6979 unsigned int sqe_flags)
4fe2c963 6980{
b16fed66
PB
6981 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6982 return false;
6983
6984 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6985 ctx->restrictions.sqe_flags_required)
6986 return false;
6987
6988 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6989 ctx->restrictions.sqe_flags_required))
6990 return false;
6991
6992 return true;
4fe2c963
JL
6993}
6994
22b2ca31
PB
6995static void io_init_req_drain(struct io_kiocb *req)
6996{
6997 struct io_ring_ctx *ctx = req->ctx;
6998 struct io_kiocb *head = ctx->submit_state.link.head;
6999
7000 ctx->drain_active = true;
7001 if (head) {
7002 /*
7003 * If we need to drain a request in the middle of a link, drain
7004 * the head request and the next request/link after the current
7005 * link. Considering sequential execution of links,
7006 * IOSQE_IO_DRAIN will be maintained for every request of our
7007 * link.
7008 */
7009 head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7010 ctx->drain_next = true;
7011 }
7012}
7013
b16fed66
PB
7014static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7015 const struct io_uring_sqe *sqe)
282cdc86 7016 __must_hold(&ctx->uring_lock)
b16fed66 7017{
b16fed66 7018 unsigned int sqe_flags;
fc0ae024 7019 int personality;
4a04d1d1 7020 u8 opcode;
b16fed66 7021
864ea921 7022 /* req is partially pre-initialised, see io_preinit_req() */
4a04d1d1 7023 req->opcode = opcode = READ_ONCE(sqe->opcode);
b16fed66
PB
7024 /* same numerical values with corresponding REQ_F_*, safe to copy */
7025 req->flags = sqe_flags = READ_ONCE(sqe->flags);
7026 req->user_data = READ_ONCE(sqe->user_data);
b16fed66 7027 req->file = NULL;
b16fed66 7028 req->fixed_rsrc_refs = NULL;
b16fed66 7029 req->task = current;
b16fed66 7030
4a04d1d1
PB
7031 if (unlikely(opcode >= IORING_OP_LAST)) {
7032 req->opcode = 0;
b16fed66 7033 return -EINVAL;
4a04d1d1 7034 }
68fe256a
PB
7035 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7036 /* enforce forwards compatibility on users */
7037 if (sqe_flags & ~SQE_VALID_FLAGS)
7038 return -EINVAL;
7039 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
4a04d1d1 7040 !io_op_defs[opcode].buffer_select)
68fe256a 7041 return -EOPNOTSUPP;
22b2ca31
PB
7042 if (sqe_flags & IOSQE_IO_DRAIN)
7043 io_init_req_drain(req);
2a56a9bd
PB
7044 }
7045 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7046 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7047 return -EACCES;
7048 /* knock it to the slow queue path, will be drained there */
7049 if (ctx->drain_active)
7050 req->flags |= REQ_F_FORCE_ASYNC;
7051 /* if there is no link, we're at "next" request and need to drain */
7052 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7053 ctx->drain_next = false;
7054 ctx->drain_active = true;
22b2ca31 7055 req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
2a56a9bd 7056 }
68fe256a 7057 }
b16fed66 7058
4a04d1d1 7059 if (io_op_defs[opcode].needs_file) {
6d63416d
PB
7060 struct io_submit_state *state = &ctx->submit_state;
7061
7062 /*
7063 * Plug now if we have more than 2 IO left after this, and the
7064 * target is potentially a read/write to block based storage.
7065 */
4a04d1d1 7066 if (state->need_plug && io_op_defs[opcode].plug) {
6d63416d
PB
7067 state->plug_started = true;
7068 state->need_plug = false;
7069 blk_start_plug(&state->plug);
7070 }
7071
62906e89 7072 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
ac177053 7073 (sqe_flags & IOSQE_FIXED_FILE));
b16fed66 7074 if (unlikely(!req->file))
fc0ae024 7075 return -EBADF;
b16fed66 7076 }
fc0ae024 7077
4a04d1d1
PB
7078 personality = READ_ONCE(sqe->personality);
7079 if (personality) {
7080 req->creds = xa_load(&ctx->personalities, personality);
7081 if (!req->creds)
7082 return -EINVAL;
7083 get_cred(req->creds);
7084 req->flags |= REQ_F_CREDS;
7085 }
7086
fc0ae024 7087 return io_req_prep(req, sqe);
b16fed66
PB
7088}
7089
a6b8cadc 7090static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 7091 const struct io_uring_sqe *sqe)
282cdc86 7092 __must_hold(&ctx->uring_lock)
9e645e11 7093{
a1ab7b35 7094 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 7095 int ret;
9e645e11 7096
a6b8cadc
PB
7097 ret = io_init_req(ctx, req, sqe);
7098 if (unlikely(ret)) {
a87acfde
JA
7099 trace_io_uring_req_failed(sqe, ret);
7100
a8295b98 7101 /* fail even hard links since we don't submit */
de59bc10 7102 if (link->head) {
a8295b98
HX
7103 /*
7104 * we can judge a link req is failed or cancelled by if
7105 * REQ_F_FAIL is set, but the head is an exception since
7106 * it may be set REQ_F_FAIL because of other req's failure
7107 * so let's leverage req->result to distinguish if a head
7108 * is set REQ_F_FAIL because of its failure or other req's
7109 * failure so that we can set the correct ret code for it.
7110 * init result here to avoid affecting the normal path.
7111 */
7112 if (!(link->head->flags & REQ_F_FAIL))
7113 req_fail_link_node(link->head, -ECANCELED);
7114 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7115 /*
7116 * the current req is a normal req, we should return
7117 * error and thus break the submittion loop.
7118 */
7119 io_req_complete_failed(req, ret);
7120 return ret;
de59bc10 7121 }
a8295b98 7122 req_fail_link_node(req, ret);
a6b8cadc 7123 }
441b8a78 7124
be7053b7 7125 /* don't need @sqe from now on */
236daeae
OL
7126 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
7127 req->flags, true,
7128 ctx->flags & IORING_SETUP_SQPOLL);
a6b8cadc 7129
9e645e11
JA
7130 /*
7131 * If we already have a head request, queue this one for async
7132 * submittal once the head completes. If we don't have a head but
7133 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7134 * submitted sync once the chain is complete. If none of those
7135 * conditions are true (normal request), then just queue it.
7136 */
863e0560
PB
7137 if (link->head) {
7138 struct io_kiocb *head = link->head;
4e88d6e7 7139
a8295b98
HX
7140 if (!(req->flags & REQ_F_FAIL)) {
7141 ret = io_req_prep_async(req);
7142 if (unlikely(ret)) {
7143 req_fail_link_node(req, ret);
7144 if (!(head->flags & REQ_F_FAIL))
7145 req_fail_link_node(head, -ECANCELED);
7146 }
7147 }
9d76377f 7148 trace_io_uring_link(ctx, req, head);
f2f87370 7149 link->last->link = req;
863e0560 7150 link->last = req;
32fe525b 7151
f15a3431
PB
7152 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7153 return 0;
32fe525b 7154 /* last request of a link, enqueue the link */
f15a3431
PB
7155 link->head = NULL;
7156 req = head;
7157 } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7158 link->head = req;
7159 link->last = req;
7160 return 0;
9e645e11 7161 }
2e6e1fde 7162
f15a3431 7163 io_queue_sqe(req);
1d4240cc 7164 return 0;
9e645e11
JA
7165}
7166
9a56a232
JA
7167/*
7168 * Batched submission is done, ensure local IO is flushed out.
7169 */
553deffd 7170static void io_submit_state_end(struct io_ring_ctx *ctx)
9a56a232 7171{
553deffd
PB
7172 struct io_submit_state *state = &ctx->submit_state;
7173
a1ab7b35 7174 if (state->link.head)
de59bc10 7175 io_queue_sqe(state->link.head);
553deffd 7176 /* flush only after queuing links as they can generate completions */
c450178d 7177 io_submit_flush_completions(ctx);
27926b68
JA
7178 if (state->plug_started)
7179 blk_finish_plug(&state->plug);
9a56a232
JA
7180}
7181
7182/*
7183 * Start submission side cache.
7184 */
7185static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 7186 unsigned int max_ios)
9a56a232 7187{
27926b68 7188 state->plug_started = false;
4b628aeb 7189 state->need_plug = max_ios > 2;
a1ab7b35
PB
7190 /* set only head, no need to init link_last in advance */
7191 state->link.head = NULL;
9a56a232
JA
7192}
7193
2b188cc1
JA
7194static void io_commit_sqring(struct io_ring_ctx *ctx)
7195{
75b28aff 7196 struct io_rings *rings = ctx->rings;
2b188cc1 7197
caf582c6
PB
7198 /*
7199 * Ensure any loads from the SQEs are done at this point,
7200 * since once we write the new head, the application could
7201 * write new data to them.
7202 */
7203 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
7204}
7205
2b188cc1 7206/*
dd9ae8a0 7207 * Fetch an sqe, if one is available. Note this returns a pointer to memory
2b188cc1
JA
7208 * that is mapped by userspace. This means that care needs to be taken to
7209 * ensure that reads are stable, as we cannot rely on userspace always
7210 * being a good citizen. If members of the sqe are validated and then later
7211 * used, it's important that those reads are done through READ_ONCE() to
7212 * prevent a re-load down the line.
7213 */
709b302f 7214static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 7215{
ea5ab3b5 7216 unsigned head, mask = ctx->sq_entries - 1;
17d3aeb3 7217 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2b188cc1
JA
7218
7219 /*
7220 * The cached sq head (or cq tail) serves two purposes:
7221 *
7222 * 1) allows us to batch the cost of updating the user visible
7223 * head updates.
7224 * 2) allows the kernel side to track the head on its own, even
7225 * though the application is the one updating it.
7226 */
17d3aeb3 7227 head = READ_ONCE(ctx->sq_array[sq_idx]);
709b302f
PB
7228 if (likely(head < ctx->sq_entries))
7229 return &ctx->sq_sqes[head];
2b188cc1
JA
7230
7231 /* drop invalid entries */
15641e42
PB
7232 ctx->cq_extra--;
7233 WRITE_ONCE(ctx->rings->sq_dropped,
7234 READ_ONCE(ctx->rings->sq_dropped) + 1);
709b302f
PB
7235 return NULL;
7236}
7237
0f212204 7238static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
282cdc86 7239 __must_hold(&ctx->uring_lock)
6c271ce2 7240{
69629809 7241 unsigned int entries = io_sqring_entries(ctx);
46c4e16a 7242 int submitted = 0;
6c271ce2 7243
51d48dab 7244 if (unlikely(!entries))
69629809 7245 return 0;
ee7d46d9 7246 /* make sure SQ entry isn't read before tail */
69629809 7247 nr = min3(nr, ctx->sq_entries, entries);
9a10867a 7248 io_get_task_refs(nr);
6c271ce2 7249
ba88ff11 7250 io_submit_state_start(&ctx->submit_state, nr);
69629809 7251 do {
3529d8c2 7252 const struct io_uring_sqe *sqe;
196be95c 7253 struct io_kiocb *req;
fb5ccc98 7254
a33ae9ce 7255 if (unlikely(!io_alloc_req_refill(ctx))) {
196be95c
PB
7256 if (!submitted)
7257 submitted = -EAGAIN;
fb5ccc98 7258 break;
196be95c 7259 }
a33ae9ce 7260 req = io_alloc_req(ctx);
4fccfcbb
PB
7261 sqe = io_get_sqe(ctx);
7262 if (unlikely(!sqe)) {
c2b6c6bc 7263 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
4fccfcbb
PB
7264 break;
7265 }
d3656344
JA
7266 /* will complete beyond this point, count as submitted */
7267 submitted++;
a1ab7b35 7268 if (io_submit_sqe(ctx, req, sqe))
196be95c 7269 break;
69629809 7270 } while (submitted < nr);
6c271ce2 7271
9466f437
PB
7272 if (unlikely(submitted != nr)) {
7273 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
d8a6df10 7274 int unused = nr - ref_used;
9466f437 7275
09899b19 7276 current->io_uring->cached_refs += unused;
9466f437 7277 }
6c271ce2 7278
553deffd 7279 io_submit_state_end(ctx);
ae9428ca
PB
7280 /* Commit SQ ring head once we've consumed and submitted all SQEs */
7281 io_commit_sqring(ctx);
7282
6c271ce2
JA
7283 return submitted;
7284}
7285
e4b6d902
PB
7286static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7287{
7288 return READ_ONCE(sqd->state);
7289}
7290
23b3628e
XW
7291static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7292{
7293 /* Tell userspace we may need a wakeup call */
79ebeaee 7294 spin_lock(&ctx->completion_lock);
20c0b380
NA
7295 WRITE_ONCE(ctx->rings->sq_flags,
7296 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
79ebeaee 7297 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7298}
7299
7300static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7301{
79ebeaee 7302 spin_lock(&ctx->completion_lock);
20c0b380
NA
7303 WRITE_ONCE(ctx->rings->sq_flags,
7304 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
79ebeaee 7305 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7306}
7307
08369246 7308static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 7309{
c8d1ba58 7310 unsigned int to_submit;
bdcd3eab 7311 int ret = 0;
6c271ce2 7312
c8d1ba58 7313 to_submit = io_sqring_entries(ctx);
e95eee2d 7314 /* if we're handling multiple rings, cap submit size for fairness */
4ce8ad95
OL
7315 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7316 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
e95eee2d 7317
5eef4e87 7318 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
948e1947
PB
7319 const struct cred *creds = NULL;
7320
7321 if (ctx->sq_creds != current_cred())
7322 creds = override_creds(ctx->sq_creds);
a4c0b3de 7323
c8d1ba58 7324 mutex_lock(&ctx->uring_lock);
5eef4e87 7325 if (!wq_list_empty(&ctx->iopoll_list))
5ba3c874 7326 io_do_iopoll(ctx, true);
906a3c6f 7327
3b763ba1
PB
7328 /*
7329 * Don't submit if refs are dying, good for io_uring_register(),
7330 * but also it is relied upon by io_ring_exit_work()
7331 */
0298ef96
PB
7332 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7333 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 7334 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58 7335 mutex_unlock(&ctx->uring_lock);
6c271ce2 7336
acfb381d
PB
7337 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7338 wake_up(&ctx->sqo_sq_wait);
948e1947
PB
7339 if (creds)
7340 revert_creds(creds);
acfb381d 7341 }
6c271ce2 7342
08369246
XW
7343 return ret;
7344}
6c271ce2 7345
c072481d 7346static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
08369246
XW
7347{
7348 struct io_ring_ctx *ctx;
7349 unsigned sq_thread_idle = 0;
6c271ce2 7350
c9dca27d
PB
7351 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7352 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
08369246 7353 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 7354}
6c271ce2 7355
e4b6d902
PB
7356static bool io_sqd_handle_event(struct io_sq_data *sqd)
7357{
7358 bool did_sig = false;
7359 struct ksignal ksig;
7360
7361 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7362 signal_pending(current)) {
7363 mutex_unlock(&sqd->lock);
7364 if (signal_pending(current))
7365 did_sig = get_signal(&ksig);
7366 cond_resched();
7367 mutex_lock(&sqd->lock);
7368 }
e4b6d902
PB
7369 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7370}
7371
c8d1ba58
JA
7372static int io_sq_thread(void *data)
7373{
69fb2131
JA
7374 struct io_sq_data *sqd = data;
7375 struct io_ring_ctx *ctx;
a0d9205f 7376 unsigned long timeout = 0;
37d1e2e3 7377 char buf[TASK_COMM_LEN];
08369246 7378 DEFINE_WAIT(wait);
6c271ce2 7379
696ee88a 7380 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
37d1e2e3 7381 set_task_comm(current, buf);
37d1e2e3
JA
7382
7383 if (sqd->sq_cpu != -1)
7384 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7385 else
7386 set_cpus_allowed_ptr(current, cpu_online_mask);
7387 current->flags |= PF_NO_SETAFFINITY;
7388
09a6f4ef 7389 mutex_lock(&sqd->lock);
e4b6d902 7390 while (1) {
1a924a80 7391 bool cap_entries, sqt_spin = false;
c1edbf5f 7392
e4b6d902
PB
7393 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7394 if (io_sqd_handle_event(sqd))
c7d95613 7395 break;
08369246
XW
7396 timeout = jiffies + sqd->sq_thread_idle;
7397 }
e4b6d902 7398
e95eee2d 7399 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 7400 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
948e1947 7401 int ret = __io_sq_thread(ctx, cap_entries);
7c30f36a 7402
5eef4e87 7403 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
08369246 7404 sqt_spin = true;
69fb2131 7405 }
dd432ea5
PB
7406 if (io_run_task_work())
7407 sqt_spin = true;
6c271ce2 7408
08369246 7409 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 7410 cond_resched();
08369246
XW
7411 if (sqt_spin)
7412 timeout = jiffies + sqd->sq_thread_idle;
7413 continue;
7414 }
7415
08369246 7416 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
dd432ea5 7417 if (!io_sqd_events_pending(sqd) && !current->task_works) {
1a924a80
PB
7418 bool needs_sched = true;
7419
724cb4f9 7420 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
aaa9f0f4
PB
7421 io_ring_set_wakeup_flag(ctx);
7422
724cb4f9 7423 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5eef4e87 7424 !wq_list_empty(&ctx->iopoll_list)) {
724cb4f9
HX
7425 needs_sched = false;
7426 break;
7427 }
7428 if (io_sqring_entries(ctx)) {
7429 needs_sched = false;
7430 break;
7431 }
7432 }
7433
7434 if (needs_sched) {
7435 mutex_unlock(&sqd->lock);
7436 schedule();
7437 mutex_lock(&sqd->lock);
7438 }
69fb2131
JA
7439 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7440 io_ring_clear_wakeup_flag(ctx);
6c271ce2 7441 }
08369246
XW
7442
7443 finish_wait(&sqd->wait, &wait);
7444 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 7445 }
28cea78a 7446
78cc687b 7447 io_uring_cancel_generic(true, sqd);
37d1e2e3 7448 sqd->thread = NULL;
05962f95 7449 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
5f3f26f9 7450 io_ring_set_wakeup_flag(ctx);
521d6a73 7451 io_run_task_work();
734551df
PB
7452 mutex_unlock(&sqd->lock);
7453
37d1e2e3
JA
7454 complete(&sqd->exited);
7455 do_exit(0);
6c271ce2
JA
7456}
7457
bda52162
JA
7458struct io_wait_queue {
7459 struct wait_queue_entry wq;
7460 struct io_ring_ctx *ctx;
5fd46178 7461 unsigned cq_tail;
bda52162
JA
7462 unsigned nr_timeouts;
7463};
7464
6c503150 7465static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
7466{
7467 struct io_ring_ctx *ctx = iowq->ctx;
5fd46178 7468 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
bda52162
JA
7469
7470 /*
d195a66e 7471 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
7472 * started waiting. For timeouts, we always want to return to userspace,
7473 * regardless of event count.
7474 */
5fd46178 7475 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
bda52162
JA
7476}
7477
7478static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7479 int wake_flags, void *key)
7480{
7481 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7482 wq);
7483
6c503150
PB
7484 /*
7485 * Cannot safely flush overflowed CQEs from here, ensure we wake up
7486 * the task, and the next invocation will do it.
7487 */
5ed7a37d 7488 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
6c503150
PB
7489 return autoremove_wake_function(curr, mode, wake_flags, key);
7490 return -1;
bda52162
JA
7491}
7492
af9c1a44
JA
7493static int io_run_task_work_sig(void)
7494{
7495 if (io_run_task_work())
7496 return 1;
7497 if (!signal_pending(current))
7498 return 0;
0b8cfa97 7499 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
792ee0f6 7500 return -ERESTARTSYS;
af9c1a44
JA
7501 return -EINTR;
7502}
7503
eeb60b9a
PB
7504/* when returns >0, the caller should retry */
7505static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7506 struct io_wait_queue *iowq,
7507 signed long *timeout)
7508{
7509 int ret;
7510
7511 /* make sure we run task_work before checking for signals */
7512 ret = io_run_task_work_sig();
7513 if (ret || io_should_wake(iowq))
7514 return ret;
7515 /* let the caller flush overflows, retry */
5ed7a37d 7516 if (test_bit(0, &ctx->check_cq_overflow))
eeb60b9a
PB
7517 return 1;
7518
7519 *timeout = schedule_timeout(*timeout);
7520 return !*timeout ? -ETIME : 1;
7521}
7522
2b188cc1
JA
7523/*
7524 * Wait until events become available, if we don't already have some. The
7525 * application must reap them itself, as they reside on the shared cq ring.
7526 */
7527static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
7528 const sigset_t __user *sig, size_t sigsz,
7529 struct __kernel_timespec __user *uts)
2b188cc1 7530{
90291099 7531 struct io_wait_queue iowq;
75b28aff 7532 struct io_rings *rings = ctx->rings;
c1d5a224
PB
7533 signed long timeout = MAX_SCHEDULE_TIMEOUT;
7534 int ret;
2b188cc1 7535
b41e9852 7536 do {
90f67366 7537 io_cqring_overflow_flush(ctx);
6c503150 7538 if (io_cqring_events(ctx) >= min_events)
b41e9852 7539 return 0;
4c6e277c 7540 if (!io_run_task_work())
b41e9852 7541 break;
b41e9852 7542 } while (1);
2b188cc1 7543
44df58d4
XW
7544 if (uts) {
7545 struct timespec64 ts;
7546
7547 if (get_timespec64(&ts, uts))
7548 return -EFAULT;
7549 timeout = timespec64_to_jiffies(&ts);
7550 }
7551
2b188cc1 7552 if (sig) {
9e75ad5d
AB
7553#ifdef CONFIG_COMPAT
7554 if (in_compat_syscall())
7555 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 7556 sigsz);
9e75ad5d
AB
7557 else
7558#endif
b772434b 7559 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 7560
2b188cc1
JA
7561 if (ret)
7562 return ret;
7563 }
7564
90291099
PB
7565 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7566 iowq.wq.private = current;
7567 INIT_LIST_HEAD(&iowq.wq.entry);
7568 iowq.ctx = ctx;
bda52162 7569 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
5fd46178 7570 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
90291099 7571
c826bd7a 7572 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 7573 do {
ca0a2651 7574 /* if we can't even flush overflow, don't wait for more */
90f67366 7575 if (!io_cqring_overflow_flush(ctx)) {
ca0a2651
JA
7576 ret = -EBUSY;
7577 break;
7578 }
311997b3 7579 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
bda52162 7580 TASK_INTERRUPTIBLE);
eeb60b9a 7581 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
311997b3 7582 finish_wait(&ctx->cq_wait, &iowq.wq);
ca0a2651 7583 cond_resched();
eeb60b9a 7584 } while (ret > 0);
bda52162 7585
b7db41c9 7586 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 7587
75b28aff 7588 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
7589}
7590
9123c8ff 7591static void io_free_page_table(void **table, size_t size)
05f3fb3c 7592{
9123c8ff 7593 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
05f3fb3c 7594
846a4ef2 7595 for (i = 0; i < nr_tables; i++)
9123c8ff
PB
7596 kfree(table[i]);
7597 kfree(table);
7598}
7599
c072481d 7600static __cold void **io_alloc_page_table(size_t size)
9123c8ff
PB
7601{
7602 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7603 size_t init_size = size;
7604 void **table;
7605
0bea96f5 7606 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
9123c8ff
PB
7607 if (!table)
7608 return NULL;
7609
7610 for (i = 0; i < nr_tables; i++) {
27f6b318 7611 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
9123c8ff 7612
0bea96f5 7613 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
9123c8ff
PB
7614 if (!table[i]) {
7615 io_free_page_table(table, init_size);
7616 return NULL;
7617 }
7618 size -= this_size;
7619 }
7620 return table;
05f3fb3c
JA
7621}
7622
28a9fe25 7623static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
1642b445 7624{
28a9fe25
PB
7625 percpu_ref_exit(&ref_node->refs);
7626 kfree(ref_node);
1642b445
PB
7627}
7628
c072481d 7629static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
b9bd2bea
PB
7630{
7631 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7632 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7633 unsigned long flags;
7634 bool first_add = false;
7635
7636 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7637 node->done = true;
7638
7639 while (!list_empty(&ctx->rsrc_ref_list)) {
7640 node = list_first_entry(&ctx->rsrc_ref_list,
7641 struct io_rsrc_node, node);
7642 /* recycle ref nodes in order */
7643 if (!node->done)
7644 break;
7645 list_del(&node->node);
7646 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7647 }
7648 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7649
7650 if (first_add)
7651 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7652}
7653
7654static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7655{
7656 struct io_rsrc_node *ref_node;
7657
7658 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7659 if (!ref_node)
7660 return NULL;
7661
7662 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7663 0, GFP_KERNEL)) {
7664 kfree(ref_node);
7665 return NULL;
7666 }
7667 INIT_LIST_HEAD(&ref_node->node);
7668 INIT_LIST_HEAD(&ref_node->rsrc_list);
7669 ref_node->done = false;
7670 return ref_node;
7671}
7672
a7f0ed5a
PB
7673static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7674 struct io_rsrc_data *data_to_kill)
ab409402 7675 __must_hold(&ctx->uring_lock)
6b06314c 7676{
a7f0ed5a
PB
7677 WARN_ON_ONCE(!ctx->rsrc_backup_node);
7678 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
6b06314c 7679
ab409402
PB
7680 io_rsrc_refs_drop(ctx);
7681
a7f0ed5a
PB
7682 if (data_to_kill) {
7683 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
82fbcfa9 7684
a7f0ed5a 7685 rsrc_node->rsrc_data = data_to_kill;
4956b9ea 7686 spin_lock_irq(&ctx->rsrc_ref_lock);
a7f0ed5a 7687 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
4956b9ea 7688 spin_unlock_irq(&ctx->rsrc_ref_lock);
82fbcfa9 7689
3e942498 7690 atomic_inc(&data_to_kill->refs);
a7f0ed5a
PB
7691 percpu_ref_kill(&rsrc_node->refs);
7692 ctx->rsrc_node = NULL;
7693 }
6b06314c 7694
a7f0ed5a
PB
7695 if (!ctx->rsrc_node) {
7696 ctx->rsrc_node = ctx->rsrc_backup_node;
7697 ctx->rsrc_backup_node = NULL;
7698 }
8bad28d8
HX
7699}
7700
a7f0ed5a 7701static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8dd03afe
PB
7702{
7703 if (ctx->rsrc_backup_node)
7704 return 0;
b895c9a6 7705 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
8dd03afe 7706 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8bad28d8
HX
7707}
7708
c072481d
PB
7709static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
7710 struct io_ring_ctx *ctx)
8bad28d8
HX
7711{
7712 int ret;
05589553 7713
215c3902 7714 /* As we may drop ->uring_lock, other task may have started quiesce */
8bad28d8
HX
7715 if (data->quiesce)
7716 return -ENXIO;
05589553 7717
8bad28d8 7718 data->quiesce = true;
1ffc5422 7719 do {
a7f0ed5a 7720 ret = io_rsrc_node_switch_start(ctx);
8dd03afe 7721 if (ret)
f2303b1f 7722 break;
a7f0ed5a 7723 io_rsrc_node_switch(ctx, data);
f2303b1f 7724
3e942498
PB
7725 /* kill initial ref, already quiesced if zero */
7726 if (atomic_dec_and_test(&data->refs))
7727 break;
c018db4a 7728 mutex_unlock(&ctx->uring_lock);
8bad28d8 7729 flush_delayed_work(&ctx->rsrc_put_work);
1ffc5422 7730 ret = wait_for_completion_interruptible(&data->done);
c018db4a
JA
7731 if (!ret) {
7732 mutex_lock(&ctx->uring_lock);
1ffc5422 7733 break;
c018db4a 7734 }
8bad28d8 7735
3e942498
PB
7736 atomic_inc(&data->refs);
7737 /* wait for all works potentially completing data->done */
7738 flush_delayed_work(&ctx->rsrc_put_work);
cb5e1b81 7739 reinit_completion(&data->done);
8dd03afe 7740
1ffc5422 7741 ret = io_run_task_work_sig();
8bad28d8 7742 mutex_lock(&ctx->uring_lock);
f2303b1f 7743 } while (ret >= 0);
8bad28d8 7744 data->quiesce = false;
05f3fb3c 7745
8bad28d8 7746 return ret;
d7954b2b
BM
7747}
7748
2d091d62
PB
7749static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7750{
7751 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7752 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7753
7754 return &data->tags[table_idx][off];
7755}
7756
44b31f2f 7757static void io_rsrc_data_free(struct io_rsrc_data *data)
1ad555c6 7758{
2d091d62
PB
7759 size_t size = data->nr * sizeof(data->tags[0][0]);
7760
7761 if (data->tags)
7762 io_free_page_table((void **)data->tags, size);
44b31f2f
PB
7763 kfree(data);
7764}
7765
c072481d
PB
7766static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7767 u64 __user *utags, unsigned nr,
7768 struct io_rsrc_data **pdata)
1ad555c6 7769{
b895c9a6 7770 struct io_rsrc_data *data;
2d091d62 7771 int ret = -ENOMEM;
d878c816 7772 unsigned i;
1ad555c6
BM
7773
7774 data = kzalloc(sizeof(*data), GFP_KERNEL);
7775 if (!data)
d878c816 7776 return -ENOMEM;
2d091d62 7777 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
b60c8dce 7778 if (!data->tags) {
1ad555c6 7779 kfree(data);
d878c816
PB
7780 return -ENOMEM;
7781 }
2d091d62
PB
7782
7783 data->nr = nr;
7784 data->ctx = ctx;
7785 data->do_put = do_put;
d878c816 7786 if (utags) {
2d091d62 7787 ret = -EFAULT;
d878c816 7788 for (i = 0; i < nr; i++) {
fdd1dc31
CIK
7789 u64 *tag_slot = io_get_tag_slot(data, i);
7790
7791 if (copy_from_user(tag_slot, &utags[i],
7792 sizeof(*tag_slot)))
2d091d62 7793 goto fail;
d878c816 7794 }
1ad555c6 7795 }
b60c8dce 7796
3e942498 7797 atomic_set(&data->refs, 1);
1ad555c6 7798 init_completion(&data->done);
d878c816
PB
7799 *pdata = data;
7800 return 0;
2d091d62
PB
7801fail:
7802 io_rsrc_data_free(data);
7803 return ret;
1ad555c6
BM
7804}
7805
9123c8ff
PB
7806static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7807{
0bea96f5
PB
7808 table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7809 GFP_KERNEL_ACCOUNT);
9123c8ff
PB
7810 return !!table->files;
7811}
7812
042b0d85 7813static void io_free_file_tables(struct io_file_table *table)
9123c8ff 7814{
042b0d85 7815 kvfree(table->files);
9123c8ff
PB
7816 table->files = NULL;
7817}
7818
fff4db76 7819static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1ad555c6 7820{
fff4db76
PB
7821#if defined(CONFIG_UNIX)
7822 if (ctx->ring_sock) {
7823 struct sock *sock = ctx->ring_sock->sk;
7824 struct sk_buff *skb;
7825
7826 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7827 kfree_skb(skb);
7828 }
7829#else
7830 int i;
7831
7832 for (i = 0; i < ctx->nr_user_files; i++) {
7833 struct file *file;
7834
7835 file = io_file_from_index(ctx, i);
7836 if (file)
7837 fput(file);
7838 }
7839#endif
042b0d85 7840 io_free_file_tables(&ctx->file_table);
44b31f2f 7841 io_rsrc_data_free(ctx->file_data);
fff4db76
PB
7842 ctx->file_data = NULL;
7843 ctx->nr_user_files = 0;
1ad555c6
BM
7844}
7845
d7954b2b
BM
7846static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7847{
d7954b2b
BM
7848 int ret;
7849
08480400 7850 if (!ctx->file_data)
d7954b2b 7851 return -ENXIO;
08480400
PB
7852 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7853 if (!ret)
7854 __io_sqe_files_unregister(ctx);
7855 return ret;
6b06314c
JA
7856}
7857
37d1e2e3 7858static void io_sq_thread_unpark(struct io_sq_data *sqd)
09a6f4ef 7859 __releases(&sqd->lock)
37d1e2e3 7860{
521d6a73
PB
7861 WARN_ON_ONCE(sqd->thread == current);
7862
9e138a48
PB
7863 /*
7864 * Do the dance but not conditional clear_bit() because it'd race with
7865 * other threads incrementing park_pending and setting the bit.
7866 */
37d1e2e3 7867 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
9e138a48
PB
7868 if (atomic_dec_return(&sqd->park_pending))
7869 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 7870 mutex_unlock(&sqd->lock);
37d1e2e3
JA
7871}
7872
86e0d676 7873static void io_sq_thread_park(struct io_sq_data *sqd)
09a6f4ef 7874 __acquires(&sqd->lock)
37d1e2e3 7875{
521d6a73
PB
7876 WARN_ON_ONCE(sqd->thread == current);
7877
9e138a48 7878 atomic_inc(&sqd->park_pending);
86e0d676 7879 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 7880 mutex_lock(&sqd->lock);
05962f95 7881 if (sqd->thread)
86e0d676 7882 wake_up_process(sqd->thread);
37d1e2e3
JA
7883}
7884
7885static void io_sq_thread_stop(struct io_sq_data *sqd)
7886{
521d6a73 7887 WARN_ON_ONCE(sqd->thread == current);
88885f66 7888 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
521d6a73 7889
05962f95 7890 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
88885f66 7891 mutex_lock(&sqd->lock);
e8f98f24
JA
7892 if (sqd->thread)
7893 wake_up_process(sqd->thread);
09a6f4ef 7894 mutex_unlock(&sqd->lock);
05962f95 7895 wait_for_completion(&sqd->exited);
37d1e2e3
JA
7896}
7897
534ca6d6 7898static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 7899{
534ca6d6 7900 if (refcount_dec_and_test(&sqd->refs)) {
9e138a48
PB
7901 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7902
37d1e2e3
JA
7903 io_sq_thread_stop(sqd);
7904 kfree(sqd);
7905 }
7906}
7907
7908static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7909{
7910 struct io_sq_data *sqd = ctx->sq_data;
7911
7912 if (sqd) {
05962f95 7913 io_sq_thread_park(sqd);
521d6a73 7914 list_del_init(&ctx->sqd_list);
37d1e2e3 7915 io_sqd_update_thread_idle(sqd);
05962f95 7916 io_sq_thread_unpark(sqd);
37d1e2e3
JA
7917
7918 io_put_sq_data(sqd);
7919 ctx->sq_data = NULL;
534ca6d6
JA
7920 }
7921}
7922
aa06165d
JA
7923static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7924{
7925 struct io_ring_ctx *ctx_attach;
7926 struct io_sq_data *sqd;
7927 struct fd f;
7928
7929 f = fdget(p->wq_fd);
7930 if (!f.file)
7931 return ERR_PTR(-ENXIO);
7932 if (f.file->f_op != &io_uring_fops) {
7933 fdput(f);
7934 return ERR_PTR(-EINVAL);
7935 }
7936
7937 ctx_attach = f.file->private_data;
7938 sqd = ctx_attach->sq_data;
7939 if (!sqd) {
7940 fdput(f);
7941 return ERR_PTR(-EINVAL);
7942 }
5c2469e0
JA
7943 if (sqd->task_tgid != current->tgid) {
7944 fdput(f);
7945 return ERR_PTR(-EPERM);
7946 }
aa06165d
JA
7947
7948 refcount_inc(&sqd->refs);
7949 fdput(f);
7950 return sqd;
7951}
7952
26984fbf
PB
7953static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7954 bool *attached)
534ca6d6
JA
7955{
7956 struct io_sq_data *sqd;
7957
26984fbf 7958 *attached = false;
5c2469e0
JA
7959 if (p->flags & IORING_SETUP_ATTACH_WQ) {
7960 sqd = io_attach_sq_data(p);
26984fbf
PB
7961 if (!IS_ERR(sqd)) {
7962 *attached = true;
5c2469e0 7963 return sqd;
26984fbf 7964 }
5c2469e0
JA
7965 /* fall through for EPERM case, setup new sqd/task */
7966 if (PTR_ERR(sqd) != -EPERM)
7967 return sqd;
7968 }
aa06165d 7969
534ca6d6
JA
7970 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7971 if (!sqd)
7972 return ERR_PTR(-ENOMEM);
7973
9e138a48 7974 atomic_set(&sqd->park_pending, 0);
534ca6d6 7975 refcount_set(&sqd->refs, 1);
69fb2131 7976 INIT_LIST_HEAD(&sqd->ctx_list);
09a6f4ef 7977 mutex_init(&sqd->lock);
534ca6d6 7978 init_waitqueue_head(&sqd->wait);
37d1e2e3 7979 init_completion(&sqd->exited);
534ca6d6
JA
7980 return sqd;
7981}
7982
6b06314c 7983#if defined(CONFIG_UNIX)
6b06314c
JA
7984/*
7985 * Ensure the UNIX gc is aware of our file set, so we are certain that
7986 * the io_uring can be safely unregistered on process exit, even if we have
7987 * loops in the file referencing.
7988 */
7989static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7990{
7991 struct sock *sk = ctx->ring_sock->sk;
7992 struct scm_fp_list *fpl;
7993 struct sk_buff *skb;
08a45173 7994 int i, nr_files;
6b06314c 7995
6b06314c
JA
7996 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7997 if (!fpl)
7998 return -ENOMEM;
7999
8000 skb = alloc_skb(0, GFP_KERNEL);
8001 if (!skb) {
8002 kfree(fpl);
8003 return -ENOMEM;
8004 }
8005
8006 skb->sk = sk;
6b06314c 8007
08a45173 8008 nr_files = 0;
62e398be 8009 fpl->user = get_uid(current_user());
6b06314c 8010 for (i = 0; i < nr; i++) {
65e19f54
JA
8011 struct file *file = io_file_from_index(ctx, i + offset);
8012
8013 if (!file)
08a45173 8014 continue;
65e19f54 8015 fpl->fp[nr_files] = get_file(file);
08a45173
JA
8016 unix_inflight(fpl->user, fpl->fp[nr_files]);
8017 nr_files++;
6b06314c
JA
8018 }
8019
08a45173
JA
8020 if (nr_files) {
8021 fpl->max = SCM_MAX_FD;
8022 fpl->count = nr_files;
8023 UNIXCB(skb).fp = fpl;
05f3fb3c 8024 skb->destructor = unix_destruct_scm;
08a45173
JA
8025 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8026 skb_queue_head(&sk->sk_receive_queue, skb);
6b06314c 8027
08a45173
JA
8028 for (i = 0; i < nr_files; i++)
8029 fput(fpl->fp[i]);
8030 } else {
8031 kfree_skb(skb);
8032 kfree(fpl);
8033 }
6b06314c
JA
8034
8035 return 0;
8036}
8037
8038/*
8039 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
8040 * causes regular reference counting to break down. We rely on the UNIX
8041 * garbage collection to take care of this problem for us.
8042 */
8043static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8044{
8045 unsigned left, total;
8046 int ret = 0;
8047
8048 total = 0;
8049 left = ctx->nr_user_files;
8050 while (left) {
8051 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6b06314c
JA
8052
8053 ret = __io_sqe_files_scm(ctx, this_files, total);
8054 if (ret)
8055 break;
8056 left -= this_files;
8057 total += this_files;
8058 }
8059
8060 if (!ret)
8061 return 0;
8062
8063 while (total < ctx->nr_user_files) {
65e19f54
JA
8064 struct file *file = io_file_from_index(ctx, total);
8065
8066 if (file)
8067 fput(file);
6b06314c
JA
8068 total++;
8069 }
8070
8071 return ret;
8072}
8073#else
8074static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8075{
8076 return 0;
8077}
8078#endif
8079
47e90392 8080static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 8081{
50238531 8082 struct file *file = prsrc->file;
05f3fb3c
JA
8083#if defined(CONFIG_UNIX)
8084 struct sock *sock = ctx->ring_sock->sk;
8085 struct sk_buff_head list, *head = &sock->sk_receive_queue;
8086 struct sk_buff *skb;
8087 int i;
8088
8089 __skb_queue_head_init(&list);
8090
8091 /*
8092 * Find the skb that holds this file in its SCM_RIGHTS. When found,
8093 * remove this entry and rearrange the file array.
8094 */
8095 skb = skb_dequeue(head);
8096 while (skb) {
8097 struct scm_fp_list *fp;
8098
8099 fp = UNIXCB(skb).fp;
8100 for (i = 0; i < fp->count; i++) {
8101 int left;
8102
8103 if (fp->fp[i] != file)
8104 continue;
8105
8106 unix_notinflight(fp->user, fp->fp[i]);
8107 left = fp->count - 1 - i;
8108 if (left) {
8109 memmove(&fp->fp[i], &fp->fp[i + 1],
8110 left * sizeof(struct file *));
8111 }
8112 fp->count--;
8113 if (!fp->count) {
8114 kfree_skb(skb);
8115 skb = NULL;
8116 } else {
8117 __skb_queue_tail(&list, skb);
8118 }
8119 fput(file);
8120 file = NULL;
8121 break;
8122 }
8123
8124 if (!file)
8125 break;
8126
8127 __skb_queue_tail(&list, skb);
8128
8129 skb = skb_dequeue(head);
8130 }
8131
8132 if (skb_peek(&list)) {
8133 spin_lock_irq(&head->lock);
8134 while ((skb = __skb_dequeue(&list)) != NULL)
8135 __skb_queue_tail(head, skb);
8136 spin_unlock_irq(&head->lock);
8137 }
8138#else
8139 fput(file);
8140#endif
8141}
8142
b895c9a6 8143static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
65e19f54 8144{
b895c9a6 8145 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
269bbe5f
BM
8146 struct io_ring_ctx *ctx = rsrc_data->ctx;
8147 struct io_rsrc_put *prsrc, *tmp;
05589553 8148
269bbe5f
BM
8149 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8150 list_del(&prsrc->list);
b60c8dce
PB
8151
8152 if (prsrc->tag) {
8153 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
b60c8dce
PB
8154
8155 io_ring_submit_lock(ctx, lock_ring);
79ebeaee 8156 spin_lock(&ctx->completion_lock);
b60c8dce 8157 io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
2840f710 8158 ctx->cq_extra++;
b60c8dce 8159 io_commit_cqring(ctx);
79ebeaee 8160 spin_unlock(&ctx->completion_lock);
b60c8dce
PB
8161 io_cqring_ev_posted(ctx);
8162 io_ring_submit_unlock(ctx, lock_ring);
8163 }
8164
40ae0ff7 8165 rsrc_data->do_put(ctx, prsrc);
269bbe5f 8166 kfree(prsrc);
65e19f54 8167 }
05589553 8168
28a9fe25 8169 io_rsrc_node_destroy(ref_node);
3e942498
PB
8170 if (atomic_dec_and_test(&rsrc_data->refs))
8171 complete(&rsrc_data->done);
2faf852d 8172}
65e19f54 8173
269bbe5f 8174static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
8175{
8176 struct io_ring_ctx *ctx;
8177 struct llist_node *node;
8178
269bbe5f
BM
8179 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8180 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
8181
8182 while (node) {
b895c9a6 8183 struct io_rsrc_node *ref_node;
4a38aed2
JA
8184 struct llist_node *next = node->next;
8185
b895c9a6 8186 ref_node = llist_entry(node, struct io_rsrc_node, llist);
269bbe5f 8187 __io_rsrc_put_work(ref_node);
4a38aed2
JA
8188 node = next;
8189 }
8190}
8191
6b06314c 8192static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
792e3582 8193 unsigned nr_args, u64 __user *tags)
6b06314c
JA
8194{
8195 __s32 __user *fds = (__s32 __user *) arg;
05f3fb3c 8196 struct file *file;
f3baed39 8197 int fd, ret;
846a4ef2 8198 unsigned i;
6b06314c 8199
05f3fb3c 8200 if (ctx->file_data)
6b06314c
JA
8201 return -EBUSY;
8202 if (!nr_args)
8203 return -EINVAL;
8204 if (nr_args > IORING_MAX_FIXED_FILES)
8205 return -EMFILE;
3a1b8a4e
PB
8206 if (nr_args > rlimit(RLIMIT_NOFILE))
8207 return -EMFILE;
a7f0ed5a 8208 ret = io_rsrc_node_switch_start(ctx);
f3baed39
PB
8209 if (ret)
8210 return ret;
d878c816
PB
8211 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8212 &ctx->file_data);
8213 if (ret)
8214 return ret;
6b06314c 8215
f3baed39 8216 ret = -ENOMEM;
aeca241b 8217 if (!io_alloc_file_tables(&ctx->file_table, nr_args))
1ad555c6 8218 goto out_free;
65e19f54 8219
08a45173 8220 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
d878c816 8221 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
600cf3f8
PB
8222 ret = -EFAULT;
8223 goto out_fput;
8224 }
08a45173 8225 /* allow sparse sets */
792e3582
PB
8226 if (fd == -1) {
8227 ret = -EINVAL;
2d091d62 8228 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
792e3582 8229 goto out_fput;
08a45173 8230 continue;
792e3582 8231 }
6b06314c 8232
05f3fb3c 8233 file = fget(fd);
6b06314c 8234 ret = -EBADF;
792e3582 8235 if (unlikely(!file))
600cf3f8 8236 goto out_fput;
05f3fb3c 8237
6b06314c
JA
8238 /*
8239 * Don't allow io_uring instances to be registered. If UNIX
8240 * isn't enabled, then this causes a reference cycle and this
8241 * instance can never get freed. If UNIX is enabled we'll
8242 * handle it just fine, but there's still no point in allowing
8243 * a ring fd as it doesn't support regular read/write anyway.
8244 */
05f3fb3c
JA
8245 if (file->f_op == &io_uring_fops) {
8246 fput(file);
600cf3f8 8247 goto out_fput;
6b06314c 8248 }
aeca241b 8249 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
6b06314c
JA
8250 }
8251
6b06314c 8252 ret = io_sqe_files_scm(ctx);
05589553 8253 if (ret) {
08480400 8254 __io_sqe_files_unregister(ctx);
05589553
XW
8255 return ret;
8256 }
6b06314c 8257
a7f0ed5a 8258 io_rsrc_node_switch(ctx, NULL);
6b06314c 8259 return ret;
600cf3f8
PB
8260out_fput:
8261 for (i = 0; i < ctx->nr_user_files; i++) {
8262 file = io_file_from_index(ctx, i);
8263 if (file)
8264 fput(file);
8265 }
042b0d85 8266 io_free_file_tables(&ctx->file_table);
600cf3f8 8267 ctx->nr_user_files = 0;
600cf3f8 8268out_free:
44b31f2f 8269 io_rsrc_data_free(ctx->file_data);
55cbc256 8270 ctx->file_data = NULL;
6b06314c
JA
8271 return ret;
8272}
8273
c3a31e60
JA
8274static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
8275 int index)
8276{
8277#if defined(CONFIG_UNIX)
8278 struct sock *sock = ctx->ring_sock->sk;
8279 struct sk_buff_head *head = &sock->sk_receive_queue;
8280 struct sk_buff *skb;
8281
8282 /*
8283 * See if we can merge this file into an existing skb SCM_RIGHTS
8284 * file set. If there's no room, fall back to allocating a new skb
8285 * and filling it in.
8286 */
8287 spin_lock_irq(&head->lock);
8288 skb = skb_peek(head);
8289 if (skb) {
8290 struct scm_fp_list *fpl = UNIXCB(skb).fp;
8291
8292 if (fpl->count < SCM_MAX_FD) {
8293 __skb_unlink(skb, head);
8294 spin_unlock_irq(&head->lock);
8295 fpl->fp[fpl->count] = get_file(file);
8296 unix_inflight(fpl->user, fpl->fp[fpl->count]);
8297 fpl->count++;
8298 spin_lock_irq(&head->lock);
8299 __skb_queue_head(head, skb);
8300 } else {
8301 skb = NULL;
8302 }
8303 }
8304 spin_unlock_irq(&head->lock);
8305
8306 if (skb) {
8307 fput(file);
8308 return 0;
8309 }
8310
8311 return __io_sqe_files_scm(ctx, 1, index);
8312#else
8313 return 0;
8314#endif
8315}
8316
9c7b0ba8
PB
8317static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8318 struct io_rsrc_node *node, void *rsrc)
8319{
8320 struct io_rsrc_put *prsrc;
8321
8322 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8323 if (!prsrc)
8324 return -ENOMEM;
8325
8326 prsrc->tag = *io_get_tag_slot(data, idx);
8327 prsrc->rsrc = rsrc;
8328 list_add(&prsrc->list, &node->rsrc_list);
8329 return 0;
8330}
8331
b9445598
PB
8332static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8333 unsigned int issue_flags, u32 slot_index)
8334{
8335 struct io_ring_ctx *ctx = req->ctx;
8336 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
9c7b0ba8 8337 bool needs_switch = false;
b9445598
PB
8338 struct io_fixed_file *file_slot;
8339 int ret = -EBADF;
8340
8341 io_ring_submit_lock(ctx, !force_nonblock);
8342 if (file->f_op == &io_uring_fops)
8343 goto err;
8344 ret = -ENXIO;
8345 if (!ctx->file_data)
8346 goto err;
8347 ret = -EINVAL;
8348 if (slot_index >= ctx->nr_user_files)
8349 goto err;
8350
8351 slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8352 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
9c7b0ba8
PB
8353
8354 if (file_slot->file_ptr) {
8355 struct file *old_file;
8356
8357 ret = io_rsrc_node_switch_start(ctx);
8358 if (ret)
8359 goto err;
8360
8361 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8362 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8363 ctx->rsrc_node, old_file);
8364 if (ret)
8365 goto err;
8366 file_slot->file_ptr = 0;
8367 needs_switch = true;
8368 }
b9445598
PB
8369
8370 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8371 io_fixed_file_set(file_slot, file);
8372 ret = io_sqe_file_register(ctx, file, slot_index);
8373 if (ret) {
8374 file_slot->file_ptr = 0;
8375 goto err;
8376 }
8377
8378 ret = 0;
8379err:
9c7b0ba8
PB
8380 if (needs_switch)
8381 io_rsrc_node_switch(ctx, ctx->file_data);
b9445598
PB
8382 io_ring_submit_unlock(ctx, !force_nonblock);
8383 if (ret)
8384 fput(file);
8385 return ret;
8386}
8387
7df778be
PB
8388static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8389{
8390 unsigned int offset = req->close.file_slot - 1;
8391 struct io_ring_ctx *ctx = req->ctx;
8392 struct io_fixed_file *file_slot;
8393 struct file *file;
8394 int ret, i;
8395
8396 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8397 ret = -ENXIO;
8398 if (unlikely(!ctx->file_data))
8399 goto out;
8400 ret = -EINVAL;
8401 if (offset >= ctx->nr_user_files)
8402 goto out;
8403 ret = io_rsrc_node_switch_start(ctx);
8404 if (ret)
8405 goto out;
8406
8407 i = array_index_nospec(offset, ctx->nr_user_files);
8408 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8409 ret = -EBADF;
8410 if (!file_slot->file_ptr)
8411 goto out;
8412
8413 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8414 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8415 if (ret)
8416 goto out;
8417
8418 file_slot->file_ptr = 0;
8419 io_rsrc_node_switch(ctx, ctx->file_data);
8420 ret = 0;
8421out:
8422 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8423 return ret;
8424}
8425
05f3fb3c 8426static int __io_sqe_files_update(struct io_ring_ctx *ctx,
c3bdad02 8427 struct io_uring_rsrc_update2 *up,
05f3fb3c
JA
8428 unsigned nr_args)
8429{
c3bdad02 8430 u64 __user *tags = u64_to_user_ptr(up->tags);
98f0b3b4 8431 __s32 __user *fds = u64_to_user_ptr(up->data);
b895c9a6 8432 struct io_rsrc_data *data = ctx->file_data;
a04b0ac0
PB
8433 struct io_fixed_file *file_slot;
8434 struct file *file;
98f0b3b4
PB
8435 int fd, i, err = 0;
8436 unsigned int done;
05589553 8437 bool needs_switch = false;
c3a31e60 8438
98f0b3b4
PB
8439 if (!ctx->file_data)
8440 return -ENXIO;
8441 if (up->offset + nr_args > ctx->nr_user_files)
c3a31e60
JA
8442 return -EINVAL;
8443
67973b93 8444 for (done = 0; done < nr_args; done++) {
c3bdad02
PB
8445 u64 tag = 0;
8446
8447 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8448 copy_from_user(&fd, &fds[done], sizeof(fd))) {
c3a31e60
JA
8449 err = -EFAULT;
8450 break;
8451 }
c3bdad02
PB
8452 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8453 err = -EINVAL;
8454 break;
8455 }
4e0377a1 8456 if (fd == IORING_REGISTER_FILES_SKIP)
8457 continue;
8458
67973b93 8459 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
aeca241b 8460 file_slot = io_fixed_file_slot(&ctx->file_table, i);
ea64ec02 8461
a04b0ac0
PB
8462 if (file_slot->file_ptr) {
8463 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
b60c8dce
PB
8464 err = io_queue_rsrc_removal(data, up->offset + done,
8465 ctx->rsrc_node, file);
a5318d3c
HD
8466 if (err)
8467 break;
a04b0ac0 8468 file_slot->file_ptr = 0;
05589553 8469 needs_switch = true;
c3a31e60
JA
8470 }
8471 if (fd != -1) {
c3a31e60
JA
8472 file = fget(fd);
8473 if (!file) {
8474 err = -EBADF;
8475 break;
8476 }
8477 /*
8478 * Don't allow io_uring instances to be registered. If
8479 * UNIX isn't enabled, then this causes a reference
8480 * cycle and this instance can never get freed. If UNIX
8481 * is enabled we'll handle it just fine, but there's
8482 * still no point in allowing a ring fd as it doesn't
8483 * support regular read/write anyway.
8484 */
8485 if (file->f_op == &io_uring_fops) {
8486 fput(file);
8487 err = -EBADF;
8488 break;
8489 }
2d091d62 8490 *io_get_tag_slot(data, up->offset + done) = tag;
9a321c98 8491 io_fixed_file_set(file_slot, file);
c3a31e60 8492 err = io_sqe_file_register(ctx, file, i);
f3bd9dae 8493 if (err) {
a04b0ac0 8494 file_slot->file_ptr = 0;
f3bd9dae 8495 fput(file);
c3a31e60 8496 break;
f3bd9dae 8497 }
c3a31e60 8498 }
05f3fb3c
JA
8499 }
8500
a7f0ed5a
PB
8501 if (needs_switch)
8502 io_rsrc_node_switch(ctx, data);
c3a31e60
JA
8503 return done ? done : err;
8504}
05589553 8505
685fe7fe
JA
8506static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8507 struct task_struct *task)
24369c2e 8508{
e941894e 8509 struct io_wq_hash *hash;
24369c2e 8510 struct io_wq_data data;
24369c2e 8511 unsigned int concurrency;
24369c2e 8512
362a9e65 8513 mutex_lock(&ctx->uring_lock);
e941894e
JA
8514 hash = ctx->hash_map;
8515 if (!hash) {
8516 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
362a9e65
YY
8517 if (!hash) {
8518 mutex_unlock(&ctx->uring_lock);
e941894e 8519 return ERR_PTR(-ENOMEM);
362a9e65 8520 }
e941894e
JA
8521 refcount_set(&hash->refs, 1);
8522 init_waitqueue_head(&hash->wait);
8523 ctx->hash_map = hash;
24369c2e 8524 }
362a9e65 8525 mutex_unlock(&ctx->uring_lock);
24369c2e 8526
e941894e 8527 data.hash = hash;
685fe7fe 8528 data.task = task;
ebc11b6c 8529 data.free_work = io_wq_free_work;
f5fa38c5 8530 data.do_work = io_wq_submit_work;
24369c2e 8531
d25e3a3d
JA
8532 /* Do QD, or 4 * CPUS, whatever is smallest */
8533 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 8534
5aa75ed5 8535 return io_wq_create(concurrency, &data);
24369c2e
PB
8536}
8537
c072481d
PB
8538static __cold int io_uring_alloc_task_context(struct task_struct *task,
8539 struct io_ring_ctx *ctx)
0f212204
JA
8540{
8541 struct io_uring_task *tctx;
d8a6df10 8542 int ret;
0f212204 8543
09899b19 8544 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
0f212204
JA
8545 if (unlikely(!tctx))
8546 return -ENOMEM;
8547
d8a6df10
JA
8548 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8549 if (unlikely(ret)) {
8550 kfree(tctx);
8551 return ret;
8552 }
8553
685fe7fe 8554 tctx->io_wq = io_init_wq_offload(ctx, task);
5aa75ed5
JA
8555 if (IS_ERR(tctx->io_wq)) {
8556 ret = PTR_ERR(tctx->io_wq);
8557 percpu_counter_destroy(&tctx->inflight);
8558 kfree(tctx);
8559 return ret;
8560 }
8561
0f212204
JA
8562 xa_init(&tctx->xa);
8563 init_waitqueue_head(&tctx->wait);
fdaf083c 8564 atomic_set(&tctx->in_idle, 0);
b303fe2e 8565 atomic_set(&tctx->inflight_tracked, 0);
0f212204 8566 task->io_uring = tctx;
7cbf1722
JA
8567 spin_lock_init(&tctx->task_lock);
8568 INIT_WQ_LIST(&tctx->task_list);
7cbf1722 8569 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
8570 return 0;
8571}
8572
8573void __io_uring_free(struct task_struct *tsk)
8574{
8575 struct io_uring_task *tctx = tsk->io_uring;
8576
8577 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e 8578 WARN_ON_ONCE(tctx->io_wq);
09899b19 8579 WARN_ON_ONCE(tctx->cached_refs);
ef8eaa4e 8580
d8a6df10 8581 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
8582 kfree(tctx);
8583 tsk->io_uring = NULL;
8584}
8585
c072481d
PB
8586static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
8587 struct io_uring_params *p)
2b188cc1
JA
8588{
8589 int ret;
8590
d25e3a3d
JA
8591 /* Retain compatibility with failing for an invalid attach attempt */
8592 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8593 IORING_SETUP_ATTACH_WQ) {
8594 struct fd f;
8595
8596 f = fdget(p->wq_fd);
8597 if (!f.file)
8598 return -ENXIO;
0cc936f7
JA
8599 if (f.file->f_op != &io_uring_fops) {
8600 fdput(f);
f2a48dd0 8601 return -EINVAL;
0cc936f7
JA
8602 }
8603 fdput(f);
d25e3a3d 8604 }
6c271ce2 8605 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 8606 struct task_struct *tsk;
534ca6d6 8607 struct io_sq_data *sqd;
26984fbf 8608 bool attached;
534ca6d6 8609
26984fbf 8610 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
8611 if (IS_ERR(sqd)) {
8612 ret = PTR_ERR(sqd);
8613 goto err;
8614 }
69fb2131 8615
7c30f36a 8616 ctx->sq_creds = get_current_cred();
534ca6d6 8617 ctx->sq_data = sqd;
917257da
JA
8618 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8619 if (!ctx->sq_thread_idle)
8620 ctx->sq_thread_idle = HZ;
8621
78d7f6ba 8622 io_sq_thread_park(sqd);
de75a3d3
PB
8623 list_add(&ctx->sqd_list, &sqd->ctx_list);
8624 io_sqd_update_thread_idle(sqd);
26984fbf 8625 /* don't attach to a dying SQPOLL thread, would be racy */
f2a48dd0 8626 ret = (attached && !sqd->thread) ? -ENXIO : 0;
78d7f6ba
PB
8627 io_sq_thread_unpark(sqd);
8628
de75a3d3
PB
8629 if (ret < 0)
8630 goto err;
8631 if (attached)
5aa75ed5 8632 return 0;
aa06165d 8633
6c271ce2 8634 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 8635 int cpu = p->sq_thread_cpu;
6c271ce2 8636
917257da 8637 ret = -EINVAL;
f2a48dd0 8638 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
e8f98f24 8639 goto err_sqpoll;
37d1e2e3 8640 sqd->sq_cpu = cpu;
6c271ce2 8641 } else {
37d1e2e3 8642 sqd->sq_cpu = -1;
6c271ce2 8643 }
37d1e2e3
JA
8644
8645 sqd->task_pid = current->pid;
5c2469e0 8646 sqd->task_tgid = current->tgid;
46fe18b1
JA
8647 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8648 if (IS_ERR(tsk)) {
8649 ret = PTR_ERR(tsk);
e8f98f24 8650 goto err_sqpoll;
6c271ce2 8651 }
97a73a0f 8652
46fe18b1 8653 sqd->thread = tsk;
97a73a0f 8654 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 8655 wake_up_new_task(tsk);
0f212204
JA
8656 if (ret)
8657 goto err;
6c271ce2
JA
8658 } else if (p->flags & IORING_SETUP_SQ_AFF) {
8659 /* Can't have SQ_AFF without SQPOLL */
8660 ret = -EINVAL;
8661 goto err;
8662 }
8663
2b188cc1 8664 return 0;
f2a48dd0
PB
8665err_sqpoll:
8666 complete(&ctx->sq_data->exited);
2b188cc1 8667err:
37d1e2e3 8668 io_sq_thread_finish(ctx);
2b188cc1
JA
8669 return ret;
8670}
8671
a087e2b5
BM
8672static inline void __io_unaccount_mem(struct user_struct *user,
8673 unsigned long nr_pages)
2b188cc1
JA
8674{
8675 atomic_long_sub(nr_pages, &user->locked_vm);
8676}
8677
a087e2b5
BM
8678static inline int __io_account_mem(struct user_struct *user,
8679 unsigned long nr_pages)
2b188cc1
JA
8680{
8681 unsigned long page_limit, cur_pages, new_pages;
8682
8683 /* Don't allow more pages than we can safely lock */
8684 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8685
8686 do {
8687 cur_pages = atomic_long_read(&user->locked_vm);
8688 new_pages = cur_pages + nr_pages;
8689 if (new_pages > page_limit)
8690 return -ENOMEM;
8691 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8692 new_pages) != cur_pages);
8693
8694 return 0;
8695}
8696
26bfa89e 8697static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8698{
62e398be 8699 if (ctx->user)
a087e2b5 8700 __io_unaccount_mem(ctx->user, nr_pages);
30975825 8701
26bfa89e
JA
8702 if (ctx->mm_account)
8703 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8704}
8705
26bfa89e 8706static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 8707{
30975825
BM
8708 int ret;
8709
62e398be 8710 if (ctx->user) {
30975825
BM
8711 ret = __io_account_mem(ctx->user, nr_pages);
8712 if (ret)
8713 return ret;
8714 }
8715
26bfa89e
JA
8716 if (ctx->mm_account)
8717 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
8718
8719 return 0;
8720}
8721
2b188cc1
JA
8722static void io_mem_free(void *ptr)
8723{
52e04ef4
MR
8724 struct page *page;
8725
8726 if (!ptr)
8727 return;
2b188cc1 8728
52e04ef4 8729 page = virt_to_head_page(ptr);
2b188cc1
JA
8730 if (put_page_testzero(page))
8731 free_compound_page(page);
8732}
8733
8734static void *io_mem_alloc(size_t size)
8735{
8736 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
26bfa89e 8737 __GFP_NORETRY | __GFP_ACCOUNT;
2b188cc1
JA
8738
8739 return (void *) __get_free_pages(gfp_flags, get_order(size));
8740}
8741
75b28aff
HV
8742static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8743 size_t *sq_offset)
8744{
8745 struct io_rings *rings;
8746 size_t off, sq_array_size;
8747
8748 off = struct_size(rings, cqes, cq_entries);
8749 if (off == SIZE_MAX)
8750 return SIZE_MAX;
8751
8752#ifdef CONFIG_SMP
8753 off = ALIGN(off, SMP_CACHE_BYTES);
8754 if (off == 0)
8755 return SIZE_MAX;
8756#endif
8757
b36200f5
DV
8758 if (sq_offset)
8759 *sq_offset = off;
8760
75b28aff
HV
8761 sq_array_size = array_size(sizeof(u32), sq_entries);
8762 if (sq_array_size == SIZE_MAX)
8763 return SIZE_MAX;
8764
8765 if (check_add_overflow(off, sq_array_size, &off))
8766 return SIZE_MAX;
8767
75b28aff
HV
8768 return off;
8769}
8770
41edf1a5 8771static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
7f61a1e9 8772{
41edf1a5 8773 struct io_mapped_ubuf *imu = *slot;
7f61a1e9
PB
8774 unsigned int i;
8775
6224843d
PB
8776 if (imu != ctx->dummy_ubuf) {
8777 for (i = 0; i < imu->nr_bvecs; i++)
8778 unpin_user_page(imu->bvec[i].bv_page);
8779 if (imu->acct_pages)
8780 io_unaccount_mem(ctx, imu->acct_pages);
8781 kvfree(imu);
8782 }
41edf1a5 8783 *slot = NULL;
7f61a1e9
PB
8784}
8785
bd54b6fe 8786static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
edafccee 8787{
634d00df
PB
8788 io_buffer_unmap(ctx, &prsrc->buf);
8789 prsrc->buf = NULL;
bd54b6fe 8790}
edafccee 8791
bd54b6fe
BM
8792static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8793{
8794 unsigned int i;
edafccee 8795
7f61a1e9
PB
8796 for (i = 0; i < ctx->nr_user_bufs; i++)
8797 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
edafccee 8798 kfree(ctx->user_bufs);
bb6659cc 8799 io_rsrc_data_free(ctx->buf_data);
edafccee 8800 ctx->user_bufs = NULL;
bd54b6fe 8801 ctx->buf_data = NULL;
edafccee 8802 ctx->nr_user_bufs = 0;
bd54b6fe
BM
8803}
8804
0a96bbe4 8805static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee 8806{
bd54b6fe 8807 int ret;
edafccee 8808
bd54b6fe 8809 if (!ctx->buf_data)
edafccee
JA
8810 return -ENXIO;
8811
bd54b6fe
BM
8812 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8813 if (!ret)
8814 __io_sqe_buffers_unregister(ctx);
8815 return ret;
edafccee
JA
8816}
8817
8818static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8819 void __user *arg, unsigned index)
8820{
8821 struct iovec __user *src;
8822
8823#ifdef CONFIG_COMPAT
8824 if (ctx->compat) {
8825 struct compat_iovec __user *ciovs;
8826 struct compat_iovec ciov;
8827
8828 ciovs = (struct compat_iovec __user *) arg;
8829 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8830 return -EFAULT;
8831
d55e5f5b 8832 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
8833 dst->iov_len = ciov.iov_len;
8834 return 0;
8835 }
8836#endif
8837 src = (struct iovec __user *) arg;
8838 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8839 return -EFAULT;
8840 return 0;
8841}
8842
de293938
JA
8843/*
8844 * Not super efficient, but this is just a registration time. And we do cache
8845 * the last compound head, so generally we'll only do a full search if we don't
8846 * match that one.
8847 *
8848 * We check if the given compound head page has already been accounted, to
8849 * avoid double accounting it. This allows us to account the full size of the
8850 * page, not just the constituent pages of a huge page.
8851 */
8852static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8853 int nr_pages, struct page *hpage)
8854{
8855 int i, j;
8856
8857 /* check current page array */
8858 for (i = 0; i < nr_pages; i++) {
8859 if (!PageCompound(pages[i]))
8860 continue;
8861 if (compound_head(pages[i]) == hpage)
8862 return true;
8863 }
8864
8865 /* check previously registered pages */
8866 for (i = 0; i < ctx->nr_user_bufs; i++) {
41edf1a5 8867 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
de293938
JA
8868
8869 for (j = 0; j < imu->nr_bvecs; j++) {
8870 if (!PageCompound(imu->bvec[j].bv_page))
8871 continue;
8872 if (compound_head(imu->bvec[j].bv_page) == hpage)
8873 return true;
8874 }
8875 }
8876
8877 return false;
8878}
8879
8880static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8881 int nr_pages, struct io_mapped_ubuf *imu,
8882 struct page **last_hpage)
8883{
8884 int i, ret;
8885
216e5835 8886 imu->acct_pages = 0;
de293938
JA
8887 for (i = 0; i < nr_pages; i++) {
8888 if (!PageCompound(pages[i])) {
8889 imu->acct_pages++;
8890 } else {
8891 struct page *hpage;
8892
8893 hpage = compound_head(pages[i]);
8894 if (hpage == *last_hpage)
8895 continue;
8896 *last_hpage = hpage;
8897 if (headpage_already_acct(ctx, pages, i, hpage))
8898 continue;
8899 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8900 }
8901 }
8902
8903 if (!imu->acct_pages)
8904 return 0;
8905
26bfa89e 8906 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
8907 if (ret)
8908 imu->acct_pages = 0;
8909 return ret;
8910}
8911
0a96bbe4 8912static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
41edf1a5 8913 struct io_mapped_ubuf **pimu,
0a96bbe4 8914 struct page **last_hpage)
edafccee 8915{
41edf1a5 8916 struct io_mapped_ubuf *imu = NULL;
edafccee
JA
8917 struct vm_area_struct **vmas = NULL;
8918 struct page **pages = NULL;
0a96bbe4
BM
8919 unsigned long off, start, end, ubuf;
8920 size_t size;
8921 int ret, pret, nr_pages, i;
8922
6224843d
PB
8923 if (!iov->iov_base) {
8924 *pimu = ctx->dummy_ubuf;
8925 return 0;
8926 }
8927
0a96bbe4
BM
8928 ubuf = (unsigned long) iov->iov_base;
8929 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8930 start = ubuf >> PAGE_SHIFT;
8931 nr_pages = end - start;
8932
41edf1a5 8933 *pimu = NULL;
0a96bbe4
BM
8934 ret = -ENOMEM;
8935
8936 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8937 if (!pages)
8938 goto done;
8939
8940 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8941 GFP_KERNEL);
8942 if (!vmas)
8943 goto done;
edafccee 8944
41edf1a5 8945 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
a2b4198c 8946 if (!imu)
0a96bbe4
BM
8947 goto done;
8948
8949 ret = 0;
8950 mmap_read_lock(current->mm);
8951 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8952 pages, vmas);
8953 if (pret == nr_pages) {
8954 /* don't support file backed memory */
8955 for (i = 0; i < nr_pages; i++) {
8956 struct vm_area_struct *vma = vmas[i];
8957
40dad765
PB
8958 if (vma_is_shmem(vma))
8959 continue;
0a96bbe4
BM
8960 if (vma->vm_file &&
8961 !is_file_hugepages(vma->vm_file)) {
8962 ret = -EOPNOTSUPP;
8963 break;
8964 }
8965 }
8966 } else {
8967 ret = pret < 0 ? pret : -EFAULT;
8968 }
8969 mmap_read_unlock(current->mm);
8970 if (ret) {
8971 /*
8972 * if we did partial map, or found file backed vmas,
8973 * release any pages we did get
8974 */
8975 if (pret > 0)
8976 unpin_user_pages(pages, pret);
0a96bbe4
BM
8977 goto done;
8978 }
8979
8980 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8981 if (ret) {
8982 unpin_user_pages(pages, pret);
0a96bbe4
BM
8983 goto done;
8984 }
8985
8986 off = ubuf & ~PAGE_MASK;
8987 size = iov->iov_len;
8988 for (i = 0; i < nr_pages; i++) {
8989 size_t vec_len;
8990
8991 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8992 imu->bvec[i].bv_page = pages[i];
8993 imu->bvec[i].bv_len = vec_len;
8994 imu->bvec[i].bv_offset = off;
8995 off = 0;
8996 size -= vec_len;
8997 }
8998 /* store original address for later verification */
8999 imu->ubuf = ubuf;
4751f53d 9000 imu->ubuf_end = ubuf + iov->iov_len;
0a96bbe4 9001 imu->nr_bvecs = nr_pages;
41edf1a5 9002 *pimu = imu;
0a96bbe4
BM
9003 ret = 0;
9004done:
41edf1a5
PB
9005 if (ret)
9006 kvfree(imu);
0a96bbe4
BM
9007 kvfree(pages);
9008 kvfree(vmas);
9009 return ret;
9010}
9011
2b358604 9012static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 9013{
87094465
PB
9014 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9015 return ctx->user_bufs ? 0 : -ENOMEM;
2b358604 9016}
edafccee 9017
2b358604
BM
9018static int io_buffer_validate(struct iovec *iov)
9019{
50e96989
PB
9020 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9021
2b358604
BM
9022 /*
9023 * Don't impose further limits on the size and buffer
9024 * constraints here, we'll -EINVAL later when IO is
9025 * submitted if they are wrong.
9026 */
6224843d
PB
9027 if (!iov->iov_base)
9028 return iov->iov_len ? -EFAULT : 0;
9029 if (!iov->iov_len)
2b358604 9030 return -EFAULT;
edafccee 9031
2b358604
BM
9032 /* arbitrary limit, but we need something */
9033 if (iov->iov_len > SZ_1G)
9034 return -EFAULT;
edafccee 9035
50e96989
PB
9036 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9037 return -EOVERFLOW;
9038
2b358604
BM
9039 return 0;
9040}
edafccee 9041
2b358604 9042static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
634d00df 9043 unsigned int nr_args, u64 __user *tags)
2b358604 9044{
bd54b6fe
BM
9045 struct page *last_hpage = NULL;
9046 struct io_rsrc_data *data;
2b358604
BM
9047 int i, ret;
9048 struct iovec iov;
edafccee 9049
87094465
PB
9050 if (ctx->user_bufs)
9051 return -EBUSY;
489809e2 9052 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
87094465 9053 return -EINVAL;
bd54b6fe 9054 ret = io_rsrc_node_switch_start(ctx);
2b358604
BM
9055 if (ret)
9056 return ret;
d878c816
PB
9057 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9058 if (ret)
9059 return ret;
bd54b6fe
BM
9060 ret = io_buffers_map_alloc(ctx, nr_args);
9061 if (ret) {
bb6659cc 9062 io_rsrc_data_free(data);
bd54b6fe
BM
9063 return ret;
9064 }
edafccee 9065
87094465 9066 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
edafccee
JA
9067 ret = io_copy_iov(ctx, &iov, arg, i);
9068 if (ret)
0a96bbe4 9069 break;
2b358604
BM
9070 ret = io_buffer_validate(&iov);
9071 if (ret)
0a96bbe4 9072 break;
2d091d62 9073 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
cf3770e7
CIK
9074 ret = -EINVAL;
9075 break;
9076 }
edafccee 9077
41edf1a5
PB
9078 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9079 &last_hpage);
0a96bbe4
BM
9080 if (ret)
9081 break;
edafccee 9082 }
0a96bbe4 9083
bd54b6fe 9084 WARN_ON_ONCE(ctx->buf_data);
0a96bbe4 9085
bd54b6fe
BM
9086 ctx->buf_data = data;
9087 if (ret)
9088 __io_sqe_buffers_unregister(ctx);
9089 else
9090 io_rsrc_node_switch(ctx, NULL);
edafccee
JA
9091 return ret;
9092}
9093
634d00df
PB
9094static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9095 struct io_uring_rsrc_update2 *up,
9096 unsigned int nr_args)
9097{
9098 u64 __user *tags = u64_to_user_ptr(up->tags);
9099 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
634d00df
PB
9100 struct page *last_hpage = NULL;
9101 bool needs_switch = false;
9102 __u32 done;
9103 int i, err;
9104
9105 if (!ctx->buf_data)
9106 return -ENXIO;
9107 if (up->offset + nr_args > ctx->nr_user_bufs)
9108 return -EINVAL;
9109
9110 for (done = 0; done < nr_args; done++) {
0b8c0e7c
PB
9111 struct io_mapped_ubuf *imu;
9112 int offset = up->offset + done;
634d00df
PB
9113 u64 tag = 0;
9114
9115 err = io_copy_iov(ctx, &iov, iovs, done);
9116 if (err)
9117 break;
9118 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9119 err = -EFAULT;
9120 break;
9121 }
0b8c0e7c
PB
9122 err = io_buffer_validate(&iov);
9123 if (err)
9124 break;
cf3770e7
CIK
9125 if (!iov.iov_base && tag) {
9126 err = -EINVAL;
9127 break;
9128 }
0b8c0e7c
PB
9129 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9130 if (err)
9131 break;
634d00df 9132
0b8c0e7c 9133 i = array_index_nospec(offset, ctx->nr_user_bufs);
6224843d 9134 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
0b8c0e7c
PB
9135 err = io_queue_rsrc_removal(ctx->buf_data, offset,
9136 ctx->rsrc_node, ctx->user_bufs[i]);
9137 if (unlikely(err)) {
9138 io_buffer_unmap(ctx, &imu);
634d00df 9139 break;
0b8c0e7c 9140 }
634d00df
PB
9141 ctx->user_bufs[i] = NULL;
9142 needs_switch = true;
9143 }
9144
0b8c0e7c 9145 ctx->user_bufs[i] = imu;
2d091d62 9146 *io_get_tag_slot(ctx->buf_data, offset) = tag;
634d00df
PB
9147 }
9148
9149 if (needs_switch)
9150 io_rsrc_node_switch(ctx, ctx->buf_data);
9151 return done ? done : err;
9152}
9153
9b402849
JA
9154static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
9155{
9156 __s32 __user *fds = arg;
9157 int fd;
9158
9159 if (ctx->cq_ev_fd)
9160 return -EBUSY;
9161
9162 if (copy_from_user(&fd, fds, sizeof(*fds)))
9163 return -EFAULT;
9164
9165 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
9166 if (IS_ERR(ctx->cq_ev_fd)) {
9167 int ret = PTR_ERR(ctx->cq_ev_fd);
fe7e3257 9168
9b402849
JA
9169 ctx->cq_ev_fd = NULL;
9170 return ret;
9171 }
9172
9173 return 0;
9174}
9175
9176static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9177{
9178 if (ctx->cq_ev_fd) {
9179 eventfd_ctx_put(ctx->cq_ev_fd);
9180 ctx->cq_ev_fd = NULL;
9181 return 0;
9182 }
9183
9184 return -ENXIO;
9185}
9186
5a2e745d
JA
9187static void io_destroy_buffers(struct io_ring_ctx *ctx)
9188{
9e15c3a0
JA
9189 struct io_buffer *buf;
9190 unsigned long index;
9191
8bab4c09 9192 xa_for_each(&ctx->io_buffers, index, buf) {
9e15c3a0 9193 __io_remove_buffers(ctx, buf, index, -1U);
8bab4c09
JA
9194 cond_resched();
9195 }
5a2e745d
JA
9196}
9197
4010fec4 9198static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 9199{
cd0ca2e0 9200 struct io_submit_state *state = &ctx->submit_state;
37f0e767 9201 int nr = 0;
bf019da7 9202
9a4fdbd8 9203 mutex_lock(&ctx->uring_lock);
cd0ca2e0 9204 io_flush_cached_locked_reqs(ctx, state);
c2b6c6bc
PB
9205
9206 while (state->free_list.next) {
9207 struct io_wq_work_node *node;
9208 struct io_kiocb *req;
9209
9210 node = wq_stack_extract(&state->free_list);
9211 req = container_of(node, struct io_kiocb, comp_list);
9212 kmem_cache_free(req_cachep, req);
37f0e767 9213 nr++;
c2b6c6bc 9214 }
37f0e767
PB
9215 if (nr)
9216 percpu_ref_put_many(&ctx->refs, nr);
9a4fdbd8
JA
9217 mutex_unlock(&ctx->uring_lock);
9218}
9219
43597aac 9220static void io_wait_rsrc_data(struct io_rsrc_data *data)
2b188cc1 9221{
43597aac 9222 if (data && !atomic_dec_and_test(&data->refs))
bd54b6fe 9223 wait_for_completion(&data->done);
bd54b6fe 9224}
04fc6c80 9225
c072481d 9226static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2b188cc1 9227{
37d1e2e3 9228 io_sq_thread_finish(ctx);
2aede0e4 9229
37d1e2e3 9230 if (ctx->mm_account) {
2aede0e4
JA
9231 mmdrop(ctx->mm_account);
9232 ctx->mm_account = NULL;
30975825 9233 }
def596e9 9234
ab409402 9235 io_rsrc_refs_drop(ctx);
43597aac
PB
9236 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9237 io_wait_rsrc_data(ctx->buf_data);
9238 io_wait_rsrc_data(ctx->file_data);
9239
8bad28d8 9240 mutex_lock(&ctx->uring_lock);
43597aac 9241 if (ctx->buf_data)
bd54b6fe 9242 __io_sqe_buffers_unregister(ctx);
43597aac 9243 if (ctx->file_data)
08480400 9244 __io_sqe_files_unregister(ctx);
c4ea060e
PB
9245 if (ctx->rings)
9246 __io_cqring_overflow_flush(ctx, true);
8bad28d8 9247 mutex_unlock(&ctx->uring_lock);
9b402849 9248 io_eventfd_unregister(ctx);
5a2e745d 9249 io_destroy_buffers(ctx);
07db298a
PB
9250 if (ctx->sq_creds)
9251 put_cred(ctx->sq_creds);
def596e9 9252
a7f0ed5a
PB
9253 /* there are no registered resources left, nobody uses it */
9254 if (ctx->rsrc_node)
9255 io_rsrc_node_destroy(ctx->rsrc_node);
8dd03afe 9256 if (ctx->rsrc_backup_node)
b895c9a6 9257 io_rsrc_node_destroy(ctx->rsrc_backup_node);
a7f0ed5a 9258 flush_delayed_work(&ctx->rsrc_put_work);
756ab7c0 9259 flush_delayed_work(&ctx->fallback_work);
a7f0ed5a
PB
9260
9261 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9262 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
def596e9 9263
2b188cc1 9264#if defined(CONFIG_UNIX)
355e8d26
EB
9265 if (ctx->ring_sock) {
9266 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 9267 sock_release(ctx->ring_sock);
355e8d26 9268 }
2b188cc1 9269#endif
ef9dd637 9270 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2b188cc1 9271
75b28aff 9272 io_mem_free(ctx->rings);
2b188cc1 9273 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
9274
9275 percpu_ref_exit(&ctx->refs);
2b188cc1 9276 free_uid(ctx->user);
4010fec4 9277 io_req_caches_free(ctx);
e941894e
JA
9278 if (ctx->hash_map)
9279 io_wq_put_hash(ctx->hash_map);
78076bb6 9280 kfree(ctx->cancel_hash);
6224843d 9281 kfree(ctx->dummy_ubuf);
2b188cc1
JA
9282 kfree(ctx);
9283}
9284
9285static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9286{
9287 struct io_ring_ctx *ctx = file->private_data;
9288 __poll_t mask = 0;
9289
d60aa65b 9290 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
9291 /*
9292 * synchronizes with barrier from wq_has_sleeper call in
9293 * io_commit_cqring
9294 */
2b188cc1 9295 smp_rmb();
90554200 9296 if (!io_sqring_full(ctx))
2b188cc1 9297 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
9298
9299 /*
9300 * Don't flush cqring overflow list here, just do a simple check.
9301 * Otherwise there could possible be ABBA deadlock:
9302 * CPU0 CPU1
9303 * ---- ----
9304 * lock(&ctx->uring_lock);
9305 * lock(&ep->mtx);
9306 * lock(&ctx->uring_lock);
9307 * lock(&ep->mtx);
9308 *
9309 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
9310 * pushs them to do the flush.
9311 */
5ed7a37d 9312 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
2b188cc1
JA
9313 mask |= EPOLLIN | EPOLLRDNORM;
9314
9315 return mask;
9316}
9317
0bead8cd 9318static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 9319{
4379bf8b 9320 const struct cred *creds;
071698e1 9321
61cf9370 9322 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
9323 if (creds) {
9324 put_cred(creds);
0bead8cd 9325 return 0;
1e6fa521 9326 }
0bead8cd
YD
9327
9328 return -EINVAL;
9329}
9330
d56d938b
PB
9331struct io_tctx_exit {
9332 struct callback_head task_work;
9333 struct completion completion;
baf186c4 9334 struct io_ring_ctx *ctx;
d56d938b
PB
9335};
9336
c072481d 9337static __cold void io_tctx_exit_cb(struct callback_head *cb)
d56d938b
PB
9338{
9339 struct io_uring_task *tctx = current->io_uring;
9340 struct io_tctx_exit *work;
9341
9342 work = container_of(cb, struct io_tctx_exit, task_work);
9343 /*
9344 * When @in_idle, we're in cancellation and it's racy to remove the
9345 * node. It'll be removed by the end of cancellation, just ignore it.
9346 */
9347 if (!atomic_read(&tctx->in_idle))
eef51daa 9348 io_uring_del_tctx_node((unsigned long)work->ctx);
d56d938b
PB
9349 complete(&work->completion);
9350}
9351
c072481d 9352static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
28090c13
PB
9353{
9354 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9355
9356 return req->ctx == data;
9357}
9358
c072481d 9359static __cold void io_ring_exit_work(struct work_struct *work)
85faa7b8 9360{
d56d938b 9361 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 9362 unsigned long timeout = jiffies + HZ * 60 * 5;
58d3be2c 9363 unsigned long interval = HZ / 20;
d56d938b
PB
9364 struct io_tctx_exit exit;
9365 struct io_tctx_node *node;
9366 int ret;
85faa7b8 9367
56952e91
JA
9368 /*
9369 * If we're doing polled IO and end up having requests being
9370 * submitted async (out-of-line), then completions can come in while
9371 * we're waiting for refs to drop. We need to reap these manually,
9372 * as nobody else will be looking for them.
9373 */
b2edc0a7 9374 do {
3dd0c97a 9375 io_uring_try_cancel_requests(ctx, NULL, true);
28090c13
PB
9376 if (ctx->sq_data) {
9377 struct io_sq_data *sqd = ctx->sq_data;
9378 struct task_struct *tsk;
9379
9380 io_sq_thread_park(sqd);
9381 tsk = sqd->thread;
9382 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
9383 io_wq_cancel_cb(tsk->io_uring->io_wq,
9384 io_cancel_ctx_cb, ctx, true);
9385 io_sq_thread_unpark(sqd);
9386 }
b5bb3a24 9387
37f0e767
PB
9388 io_req_caches_free(ctx);
9389
58d3be2c
PB
9390 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
9391 /* there is little hope left, don't run it too often */
9392 interval = HZ * 60;
9393 }
9394 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
d56d938b 9395
7f00651a
PB
9396 init_completion(&exit.completion);
9397 init_task_work(&exit.task_work, io_tctx_exit_cb);
9398 exit.ctx = ctx;
89b5066e
PB
9399 /*
9400 * Some may use context even when all refs and requests have been put,
9401 * and they are free to do so while still holding uring_lock or
5b0a6acc 9402 * completion_lock, see io_req_task_submit(). Apart from other work,
89b5066e
PB
9403 * this lock/unlock section also waits them to finish.
9404 */
d56d938b
PB
9405 mutex_lock(&ctx->uring_lock);
9406 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
9407 WARN_ON_ONCE(time_after(jiffies, timeout));
9408
d56d938b
PB
9409 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
9410 ctx_node);
7f00651a
PB
9411 /* don't spin on a single task if cancellation failed */
9412 list_rotate_left(&ctx->tctx_list);
d56d938b
PB
9413 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
9414 if (WARN_ON_ONCE(ret))
9415 continue;
d56d938b
PB
9416
9417 mutex_unlock(&ctx->uring_lock);
9418 wait_for_completion(&exit.completion);
d56d938b
PB
9419 mutex_lock(&ctx->uring_lock);
9420 }
9421 mutex_unlock(&ctx->uring_lock);
79ebeaee
JA
9422 spin_lock(&ctx->completion_lock);
9423 spin_unlock(&ctx->completion_lock);
d56d938b 9424
85faa7b8
JA
9425 io_ring_ctx_free(ctx);
9426}
9427
80c4cbdb 9428/* Returns true if we found and killed one or more timeouts */
c072481d
PB
9429static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
9430 struct task_struct *tsk, bool cancel_all)
80c4cbdb
PB
9431{
9432 struct io_kiocb *req, *tmp;
9433 int canceled = 0;
9434
79ebeaee
JA
9435 spin_lock(&ctx->completion_lock);
9436 spin_lock_irq(&ctx->timeout_lock);
80c4cbdb 9437 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
3dd0c97a 9438 if (io_match_task(req, tsk, cancel_all)) {
80c4cbdb
PB
9439 io_kill_timeout(req, -ECANCELED);
9440 canceled++;
9441 }
9442 }
79ebeaee 9443 spin_unlock_irq(&ctx->timeout_lock);
51520426
PB
9444 if (canceled != 0)
9445 io_commit_cqring(ctx);
79ebeaee 9446 spin_unlock(&ctx->completion_lock);
80c4cbdb
PB
9447 if (canceled != 0)
9448 io_cqring_ev_posted(ctx);
9449 return canceled != 0;
9450}
9451
c072481d 9452static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2b188cc1 9453{
61cf9370
MWO
9454 unsigned long index;
9455 struct creds *creds;
9456
2b188cc1
JA
9457 mutex_lock(&ctx->uring_lock);
9458 percpu_ref_kill(&ctx->refs);
634578f8 9459 if (ctx->rings)
6c2450ae 9460 __io_cqring_overflow_flush(ctx, true);
61cf9370
MWO
9461 xa_for_each(&ctx->personalities, index, creds)
9462 io_unregister_personality(ctx, index);
2b188cc1
JA
9463 mutex_unlock(&ctx->uring_lock);
9464
3dd0c97a
PB
9465 io_kill_timeouts(ctx, NULL, true);
9466 io_poll_remove_all(ctx, NULL, true);
561fb04a 9467
15dff286 9468 /* if we failed setting up the ctx, we might not have any rings */
b2edc0a7 9469 io_iopoll_try_reap_events(ctx);
309fc03a 9470
85faa7b8 9471 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
9472 /*
9473 * Use system_unbound_wq to avoid spawning tons of event kworkers
9474 * if we're exiting a ton of rings at the same time. It just adds
9475 * noise and overhead, there's no discernable change in runtime
9476 * over using system_wq.
9477 */
9478 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
9479}
9480
9481static int io_uring_release(struct inode *inode, struct file *file)
9482{
9483 struct io_ring_ctx *ctx = file->private_data;
9484
9485 file->private_data = NULL;
9486 io_ring_ctx_wait_and_kill(ctx);
9487 return 0;
9488}
9489
f6edbabb
PB
9490struct io_task_cancel {
9491 struct task_struct *task;
3dd0c97a 9492 bool all;
f6edbabb 9493};
f254ac04 9494
f6edbabb 9495static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 9496{
9a472ef7 9497 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 9498 struct io_task_cancel *cancel = data;
9a472ef7
PB
9499 bool ret;
9500
3dd0c97a 9501 if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
9a472ef7
PB
9502 struct io_ring_ctx *ctx = req->ctx;
9503
9504 /* protect against races with linked timeouts */
79ebeaee 9505 spin_lock(&ctx->completion_lock);
3dd0c97a 9506 ret = io_match_task(req, cancel->task, cancel->all);
79ebeaee 9507 spin_unlock(&ctx->completion_lock);
9a472ef7 9508 } else {
3dd0c97a 9509 ret = io_match_task(req, cancel->task, cancel->all);
9a472ef7
PB
9510 }
9511 return ret;
b711d4ea
JA
9512}
9513
c072481d
PB
9514static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9515 struct task_struct *task,
9516 bool cancel_all)
b7ddce3c 9517{
e1915f76 9518 struct io_defer_entry *de;
b7ddce3c
PB
9519 LIST_HEAD(list);
9520
79ebeaee 9521 spin_lock(&ctx->completion_lock);
b7ddce3c 9522 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
3dd0c97a 9523 if (io_match_task(de->req, task, cancel_all)) {
b7ddce3c
PB
9524 list_cut_position(&list, &ctx->defer_list, &de->list);
9525 break;
9526 }
9527 }
79ebeaee 9528 spin_unlock(&ctx->completion_lock);
e1915f76
PB
9529 if (list_empty(&list))
9530 return false;
b7ddce3c
PB
9531
9532 while (!list_empty(&list)) {
9533 de = list_first_entry(&list, struct io_defer_entry, list);
9534 list_del_init(&de->list);
f41db273 9535 io_req_complete_failed(de->req, -ECANCELED);
b7ddce3c
PB
9536 kfree(de);
9537 }
e1915f76 9538 return true;
b7ddce3c
PB
9539}
9540
c072481d 9541static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
1b00764f
PB
9542{
9543 struct io_tctx_node *node;
9544 enum io_wq_cancel cret;
9545 bool ret = false;
9546
9547 mutex_lock(&ctx->uring_lock);
9548 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9549 struct io_uring_task *tctx = node->task->io_uring;
9550
9551 /*
9552 * io_wq will stay alive while we hold uring_lock, because it's
9553 * killed after ctx nodes, which requires to take the lock.
9554 */
9555 if (!tctx || !tctx->io_wq)
9556 continue;
9557 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9558 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9559 }
9560 mutex_unlock(&ctx->uring_lock);
9561
9562 return ret;
9563}
9564
c072481d
PB
9565static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
9566 struct task_struct *task,
9567 bool cancel_all)
9936c7c2 9568{
3dd0c97a 9569 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
1b00764f 9570 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2
PB
9571
9572 while (1) {
9573 enum io_wq_cancel cret;
9574 bool ret = false;
9575
1b00764f
PB
9576 if (!task) {
9577 ret |= io_uring_try_cancel_iowq(ctx);
9578 } else if (tctx && tctx->io_wq) {
9579 /*
9580 * Cancels requests of all rings, not only @ctx, but
9581 * it's fine as the task is in exit/exec.
9582 */
5aa75ed5 9583 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
9584 &cancel, true);
9585 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9586 }
9587
9588 /* SQPOLL thread does its own polling */
3dd0c97a 9589 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
d052d1d6 9590 (ctx->sq_data && ctx->sq_data->thread == current)) {
5eef4e87 9591 while (!wq_list_empty(&ctx->iopoll_list)) {
9936c7c2
PB
9592 io_iopoll_try_reap_events(ctx);
9593 ret = true;
9594 }
9595 }
9596
3dd0c97a
PB
9597 ret |= io_cancel_defer_files(ctx, task, cancel_all);
9598 ret |= io_poll_remove_all(ctx, task, cancel_all);
9599 ret |= io_kill_timeouts(ctx, task, cancel_all);
e5dc480d
PB
9600 if (task)
9601 ret |= io_run_task_work();
9936c7c2
PB
9602 if (!ret)
9603 break;
9604 cond_resched();
9605 }
9606}
9607
eef51daa 9608static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
0f212204 9609{
236434c3 9610 struct io_uring_task *tctx = current->io_uring;
13bf43f5 9611 struct io_tctx_node *node;
a528b04e 9612 int ret;
236434c3
MWO
9613
9614 if (unlikely(!tctx)) {
5aa75ed5 9615 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
9616 if (unlikely(ret))
9617 return ret;
236434c3 9618 tctx = current->io_uring;
0f212204 9619 }
cf27f3b1
PB
9620 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9621 node = kmalloc(sizeof(*node), GFP_KERNEL);
9622 if (!node)
9623 return -ENOMEM;
9624 node->ctx = ctx;
9625 node->task = current;
13bf43f5 9626
cf27f3b1
PB
9627 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9628 node, GFP_KERNEL));
9629 if (ret) {
9630 kfree(node);
9631 return ret;
0f212204 9632 }
cf27f3b1
PB
9633
9634 mutex_lock(&ctx->uring_lock);
9635 list_add(&node->ctx_node, &ctx->tctx_list);
9636 mutex_unlock(&ctx->uring_lock);
0f212204 9637 }
cf27f3b1 9638 tctx->last = ctx;
0f212204
JA
9639 return 0;
9640}
9641
cf27f3b1
PB
9642/*
9643 * Note that this task has used io_uring. We use it for cancelation purposes.
9644 */
eef51daa 9645static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
cf27f3b1
PB
9646{
9647 struct io_uring_task *tctx = current->io_uring;
9648
9649 if (likely(tctx && tctx->last == ctx))
9650 return 0;
eef51daa 9651 return __io_uring_add_tctx_node(ctx);
cf27f3b1
PB
9652}
9653
0f212204
JA
9654/*
9655 * Remove this io_uring_file -> task mapping.
9656 */
c072481d 9657static __cold void io_uring_del_tctx_node(unsigned long index)
0f212204
JA
9658{
9659 struct io_uring_task *tctx = current->io_uring;
13bf43f5 9660 struct io_tctx_node *node;
2941267b 9661
eebd2e37
PB
9662 if (!tctx)
9663 return;
13bf43f5
PB
9664 node = xa_erase(&tctx->xa, index);
9665 if (!node)
2941267b 9666 return;
0f212204 9667
13bf43f5
PB
9668 WARN_ON_ONCE(current != node->task);
9669 WARN_ON_ONCE(list_empty(&node->ctx_node));
9670
9671 mutex_lock(&node->ctx->uring_lock);
9672 list_del(&node->ctx_node);
9673 mutex_unlock(&node->ctx->uring_lock);
9674
baf186c4 9675 if (tctx->last == node->ctx)
0f212204 9676 tctx->last = NULL;
13bf43f5 9677 kfree(node);
0f212204
JA
9678}
9679
c072481d 9680static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 9681{
ba5ef6dc 9682 struct io_wq *wq = tctx->io_wq;
13bf43f5 9683 struct io_tctx_node *node;
de7f1d9e
PB
9684 unsigned long index;
9685
8bab4c09 9686 xa_for_each(&tctx->xa, index, node) {
eef51daa 9687 io_uring_del_tctx_node(index);
8bab4c09
JA
9688 cond_resched();
9689 }
b16ef427
ME
9690 if (wq) {
9691 /*
9692 * Must be after io_uring_del_task_file() (removes nodes under
9693 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
9694 */
ba5ef6dc 9695 io_wq_put_and_exit(wq);
dadebc35 9696 tctx->io_wq = NULL;
b16ef427 9697 }
de7f1d9e
PB
9698}
9699
3f48cf18 9700static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
521d6a73 9701{
3f48cf18
PB
9702 if (tracked)
9703 return atomic_read(&tctx->inflight_tracked);
521d6a73
PB
9704 return percpu_counter_sum(&tctx->inflight);
9705}
9706
c072481d 9707static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
09899b19
PB
9708{
9709 struct io_uring_task *tctx = task->io_uring;
9710 unsigned int refs = tctx->cached_refs;
9711
e9dbe221
PB
9712 if (refs) {
9713 tctx->cached_refs = 0;
9714 percpu_counter_sub(&tctx->inflight, refs);
9715 put_task_struct_many(task, refs);
9716 }
09899b19
PB
9717}
9718
78cc687b
PB
9719/*
9720 * Find any io_uring ctx that this task has registered or done IO on, and cancel
9721 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
9722 */
c072481d
PB
9723static __cold void io_uring_cancel_generic(bool cancel_all,
9724 struct io_sq_data *sqd)
0e9ddb39 9725{
521d6a73 9726 struct io_uring_task *tctx = current->io_uring;
734551df 9727 struct io_ring_ctx *ctx;
0e9ddb39
PB
9728 s64 inflight;
9729 DEFINE_WAIT(wait);
fdaf083c 9730
78cc687b
PB
9731 WARN_ON_ONCE(sqd && sqd->thread != current);
9732
6d042ffb
PO
9733 if (!current->io_uring)
9734 return;
17a91051
PB
9735 if (tctx->io_wq)
9736 io_wq_exit_start(tctx->io_wq);
9737
0e9ddb39
PB
9738 atomic_inc(&tctx->in_idle);
9739 do {
e9dbe221 9740 io_uring_drop_tctx_refs(current);
0e9ddb39 9741 /* read completions before cancelations */
78cc687b 9742 inflight = tctx_inflight(tctx, !cancel_all);
0e9ddb39
PB
9743 if (!inflight)
9744 break;
fdaf083c 9745
78cc687b
PB
9746 if (!sqd) {
9747 struct io_tctx_node *node;
9748 unsigned long index;
0f212204 9749
78cc687b
PB
9750 xa_for_each(&tctx->xa, index, node) {
9751 /* sqpoll task will cancel all its requests */
9752 if (node->ctx->sq_data)
9753 continue;
9754 io_uring_try_cancel_requests(node->ctx, current,
9755 cancel_all);
9756 }
9757 } else {
9758 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9759 io_uring_try_cancel_requests(ctx, current,
9760 cancel_all);
9761 }
17a91051 9762
0f212204 9763 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
e9dbe221 9764 io_uring_drop_tctx_refs(current);
0f212204 9765 /*
a1bb3cd5
PB
9766 * If we've seen completions, retry without waiting. This
9767 * avoids a race where a completion comes in before we did
9768 * prepare_to_wait().
0f212204 9769 */
3dd0c97a 9770 if (inflight == tctx_inflight(tctx, !cancel_all))
a1bb3cd5 9771 schedule();
f57555ed 9772 finish_wait(&tctx->wait, &wait);
d8a6df10 9773 } while (1);
fdaf083c 9774 atomic_dec(&tctx->in_idle);
de7f1d9e 9775
8452d4a6 9776 io_uring_clean_tctx(tctx);
3dd0c97a 9777 if (cancel_all) {
3f48cf18
PB
9778 /* for exec all current's requests should be gone, kill tctx */
9779 __io_uring_free(current);
9780 }
44e728b8
PB
9781}
9782
f552a27a 9783void __io_uring_cancel(bool cancel_all)
78cc687b 9784{
f552a27a 9785 io_uring_cancel_generic(cancel_all, NULL);
78cc687b
PB
9786}
9787
6c5c240e
RP
9788static void *io_uring_validate_mmap_request(struct file *file,
9789 loff_t pgoff, size_t sz)
2b188cc1 9790{
2b188cc1 9791 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 9792 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
9793 struct page *page;
9794 void *ptr;
9795
9796 switch (offset) {
9797 case IORING_OFF_SQ_RING:
75b28aff
HV
9798 case IORING_OFF_CQ_RING:
9799 ptr = ctx->rings;
2b188cc1
JA
9800 break;
9801 case IORING_OFF_SQES:
9802 ptr = ctx->sq_sqes;
9803 break;
2b188cc1 9804 default:
6c5c240e 9805 return ERR_PTR(-EINVAL);
2b188cc1
JA
9806 }
9807
9808 page = virt_to_head_page(ptr);
a50b854e 9809 if (sz > page_size(page))
6c5c240e
RP
9810 return ERR_PTR(-EINVAL);
9811
9812 return ptr;
9813}
9814
9815#ifdef CONFIG_MMU
9816
c072481d 9817static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6c5c240e
RP
9818{
9819 size_t sz = vma->vm_end - vma->vm_start;
9820 unsigned long pfn;
9821 void *ptr;
9822
9823 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9824 if (IS_ERR(ptr))
9825 return PTR_ERR(ptr);
2b188cc1
JA
9826
9827 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9828 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9829}
9830
6c5c240e
RP
9831#else /* !CONFIG_MMU */
9832
9833static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9834{
9835 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9836}
9837
9838static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9839{
9840 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9841}
9842
9843static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9844 unsigned long addr, unsigned long len,
9845 unsigned long pgoff, unsigned long flags)
9846{
9847 void *ptr;
9848
9849 ptr = io_uring_validate_mmap_request(file, pgoff, len);
9850 if (IS_ERR(ptr))
9851 return PTR_ERR(ptr);
9852
9853 return (unsigned long) ptr;
9854}
9855
9856#endif /* !CONFIG_MMU */
9857
d9d05217 9858static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
9859{
9860 DEFINE_WAIT(wait);
9861
9862 do {
9863 if (!io_sqring_full(ctx))
9864 break;
90554200
JA
9865 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9866
9867 if (!io_sqring_full(ctx))
9868 break;
90554200
JA
9869 schedule();
9870 } while (!signal_pending(current));
9871
9872 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 9873 return 0;
90554200
JA
9874}
9875
c73ebb68
HX
9876static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9877 struct __kernel_timespec __user **ts,
9878 const sigset_t __user **sig)
9879{
9880 struct io_uring_getevents_arg arg;
9881
9882 /*
9883 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9884 * is just a pointer to the sigset_t.
9885 */
9886 if (!(flags & IORING_ENTER_EXT_ARG)) {
9887 *sig = (const sigset_t __user *) argp;
9888 *ts = NULL;
9889 return 0;
9890 }
9891
9892 /*
9893 * EXT_ARG is set - ensure we agree on the size of it and copy in our
9894 * timespec and sigset_t pointers if good.
9895 */
9896 if (*argsz != sizeof(arg))
9897 return -EINVAL;
9898 if (copy_from_user(&arg, argp, sizeof(arg)))
9899 return -EFAULT;
9900 *sig = u64_to_user_ptr(arg.sigmask);
9901 *argsz = arg.sigmask_sz;
9902 *ts = u64_to_user_ptr(arg.ts);
9903 return 0;
9904}
9905
2b188cc1 9906SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
9907 u32, min_complete, u32, flags, const void __user *, argp,
9908 size_t, argsz)
2b188cc1
JA
9909{
9910 struct io_ring_ctx *ctx;
2b188cc1
JA
9911 int submitted = 0;
9912 struct fd f;
33f993da 9913 long ret;
2b188cc1 9914
4c6e277c 9915 io_run_task_work();
b41e9852 9916
33f993da
PB
9917 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9918 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
2b188cc1
JA
9919 return -EINVAL;
9920
9921 f = fdget(fd);
33f993da 9922 if (unlikely(!f.file))
2b188cc1
JA
9923 return -EBADF;
9924
9925 ret = -EOPNOTSUPP;
33f993da 9926 if (unlikely(f.file->f_op != &io_uring_fops))
2b188cc1
JA
9927 goto out_fput;
9928
9929 ret = -ENXIO;
9930 ctx = f.file->private_data;
33f993da 9931 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
2b188cc1
JA
9932 goto out_fput;
9933
7e84e1c7 9934 ret = -EBADFD;
33f993da 9935 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7e84e1c7
SG
9936 goto out;
9937
6c271ce2
JA
9938 /*
9939 * For SQ polling, the thread will do all submissions and completions.
9940 * Just return the requested submit count, and wake the thread if
9941 * we were asked to.
9942 */
b2a9eada 9943 ret = 0;
6c271ce2 9944 if (ctx->flags & IORING_SETUP_SQPOLL) {
90f67366 9945 io_cqring_overflow_flush(ctx);
89448c47 9946
21f96522
JA
9947 if (unlikely(ctx->sq_data->thread == NULL)) {
9948 ret = -EOWNERDEAD;
04147488 9949 goto out;
21f96522 9950 }
6c271ce2 9951 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 9952 wake_up(&ctx->sq_data->wait);
d9d05217
PB
9953 if (flags & IORING_ENTER_SQ_WAIT) {
9954 ret = io_sqpoll_wait_sq(ctx);
9955 if (ret)
9956 goto out;
9957 }
6c271ce2 9958 submitted = to_submit;
b2a9eada 9959 } else if (to_submit) {
eef51daa 9960 ret = io_uring_add_tctx_node(ctx);
0f212204
JA
9961 if (unlikely(ret))
9962 goto out;
2b188cc1 9963 mutex_lock(&ctx->uring_lock);
0f212204 9964 submitted = io_submit_sqes(ctx, to_submit);
2b188cc1 9965 mutex_unlock(&ctx->uring_lock);
7c504e65
PB
9966
9967 if (submitted != to_submit)
9968 goto out;
2b188cc1
JA
9969 }
9970 if (flags & IORING_ENTER_GETEVENTS) {
c73ebb68
HX
9971 const sigset_t __user *sig;
9972 struct __kernel_timespec __user *ts;
9973
9974 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9975 if (unlikely(ret))
9976 goto out;
9977
2b188cc1
JA
9978 min_complete = min(min_complete, ctx->cq_entries);
9979
32b2244a
XW
9980 /*
9981 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9982 * space applications don't need to do io completion events
9983 * polling again, they can rely on io_sq_thread to do polling
9984 * work, which can reduce cpu usage and uring_lock contention.
9985 */
9986 if (ctx->flags & IORING_SETUP_IOPOLL &&
9987 !(ctx->flags & IORING_SETUP_SQPOLL)) {
7668b92a 9988 ret = io_iopoll_check(ctx, min_complete);
def596e9 9989 } else {
c73ebb68 9990 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 9991 }
2b188cc1
JA
9992 }
9993
7c504e65 9994out:
6805b32e 9995 percpu_ref_put(&ctx->refs);
2b188cc1
JA
9996out_fput:
9997 fdput(f);
9998 return submitted ? submitted : ret;
9999}
10000
bebdb65e 10001#ifdef CONFIG_PROC_FS
c072481d 10002static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
61cf9370 10003 const struct cred *cred)
87ce955b 10004{
87ce955b
JA
10005 struct user_namespace *uns = seq_user_ns(m);
10006 struct group_info *gi;
10007 kernel_cap_t cap;
10008 unsigned __capi;
10009 int g;
10010
10011 seq_printf(m, "%5d\n", id);
10012 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10013 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10014 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10015 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10016 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10017 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10018 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10019 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10020 seq_puts(m, "\n\tGroups:\t");
10021 gi = cred->group_info;
10022 for (g = 0; g < gi->ngroups; g++) {
10023 seq_put_decimal_ull(m, g ? " " : "",
10024 from_kgid_munged(uns, gi->gid[g]));
10025 }
10026 seq_puts(m, "\n\tCapEff:\t");
10027 cap = cred->cap_effective;
10028 CAP_FOR_EACH_U32(__capi)
10029 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10030 seq_putc(m, '\n');
10031 return 0;
10032}
10033
c072481d
PB
10034static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10035 struct seq_file *m)
87ce955b 10036{
dbbe9c64 10037 struct io_sq_data *sq = NULL;
83f84356
HX
10038 struct io_overflow_cqe *ocqe;
10039 struct io_rings *r = ctx->rings;
10040 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10041 unsigned int cached_sq_head = ctx->cached_sq_head;
10042 unsigned int cached_cq_tail = ctx->cached_cq_tail;
10043 unsigned int sq_head = READ_ONCE(r->sq.head);
10044 unsigned int sq_tail = READ_ONCE(r->sq.tail);
10045 unsigned int cq_head = READ_ONCE(r->cq.head);
10046 unsigned int cq_tail = READ_ONCE(r->cq.tail);
fad8e0de 10047 bool has_lock;
83f84356
HX
10048 unsigned int i;
10049
10050 /*
10051 * we may get imprecise sqe and cqe info if uring is actively running
10052 * since we get cached_sq_head and cached_cq_tail without uring_lock
10053 * and sq_tail and cq_head are changed by userspace. But it's ok since
10054 * we usually use these info when it is stuck.
10055 */
10056 seq_printf(m, "SqHead:\t%u\n", sq_head & sq_mask);
10057 seq_printf(m, "SqTail:\t%u\n", sq_tail & sq_mask);
10058 seq_printf(m, "CachedSqHead:\t%u\n", cached_sq_head & sq_mask);
10059 seq_printf(m, "CqHead:\t%u\n", cq_head & cq_mask);
10060 seq_printf(m, "CqTail:\t%u\n", cq_tail & cq_mask);
10061 seq_printf(m, "CachedCqTail:\t%u\n", cached_cq_tail & cq_mask);
10062 seq_printf(m, "SQEs:\t%u\n", sq_tail - cached_sq_head);
10063 for (i = cached_sq_head; i < sq_tail; i++) {
10064 unsigned int sq_idx = READ_ONCE(ctx->sq_array[i & sq_mask]);
10065
10066 if (likely(sq_idx <= sq_mask)) {
10067 struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx];
10068
10069 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10070 sq_idx, sqe->opcode, sqe->fd, sqe->flags, sqe->user_data);
10071 }
10072 }
10073 seq_printf(m, "CQEs:\t%u\n", cached_cq_tail - cq_head);
10074 for (i = cq_head; i < cached_cq_tail; i++) {
10075 struct io_uring_cqe *cqe = &r->cqes[i & cq_mask];
10076
10077 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10078 i & cq_mask, cqe->user_data, cqe->res, cqe->flags);
10079 }
87ce955b 10080
fad8e0de
JA
10081 /*
10082 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10083 * since fdinfo case grabs it in the opposite direction of normal use
10084 * cases. If we fail to get the lock, we just don't iterate any
10085 * structures that could be going away outside the io_uring mutex.
10086 */
10087 has_lock = mutex_trylock(&ctx->uring_lock);
10088
5f3f26f9 10089 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 10090 sq = ctx->sq_data;
5f3f26f9
JA
10091 if (!sq->thread)
10092 sq = NULL;
10093 }
dbbe9c64
JQ
10094
10095 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10096 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 10097 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 10098 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7b29f92d 10099 struct file *f = io_file_from_index(ctx, i);
87ce955b 10100
87ce955b
JA
10101 if (f)
10102 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10103 else
10104 seq_printf(m, "%5u: <none>\n", i);
10105 }
10106 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 10107 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
41edf1a5 10108 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
4751f53d 10109 unsigned int len = buf->ubuf_end - buf->ubuf;
87ce955b 10110
4751f53d 10111 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
87ce955b 10112 }
61cf9370
MWO
10113 if (has_lock && !xa_empty(&ctx->personalities)) {
10114 unsigned long index;
10115 const struct cred *cred;
10116
87ce955b 10117 seq_printf(m, "Personalities:\n");
61cf9370
MWO
10118 xa_for_each(&ctx->personalities, index, cred)
10119 io_uring_show_cred(m, index, cred);
87ce955b 10120 }
83f84356
HX
10121 if (has_lock)
10122 mutex_unlock(&ctx->uring_lock);
10123
10124 seq_puts(m, "PollList:\n");
79ebeaee 10125 spin_lock(&ctx->completion_lock);
d7718a9d
JA
10126 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10127 struct hlist_head *list = &ctx->cancel_hash[i];
10128 struct io_kiocb *req;
10129
10130 hlist_for_each_entry(req, list, hash_node)
10131 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
10132 req->task->task_works != NULL);
10133 }
83f84356
HX
10134
10135 seq_puts(m, "CqOverflowList:\n");
10136 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10137 struct io_uring_cqe *cqe = &ocqe->cqe;
10138
10139 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
10140 cqe->user_data, cqe->res, cqe->flags);
10141
10142 }
10143
79ebeaee 10144 spin_unlock(&ctx->completion_lock);
87ce955b
JA
10145}
10146
c072481d 10147static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
87ce955b
JA
10148{
10149 struct io_ring_ctx *ctx = f->private_data;
10150
10151 if (percpu_ref_tryget(&ctx->refs)) {
10152 __io_uring_show_fdinfo(ctx, m);
10153 percpu_ref_put(&ctx->refs);
10154 }
10155}
bebdb65e 10156#endif
87ce955b 10157
2b188cc1
JA
10158static const struct file_operations io_uring_fops = {
10159 .release = io_uring_release,
10160 .mmap = io_uring_mmap,
6c5c240e
RP
10161#ifndef CONFIG_MMU
10162 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10163 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10164#endif
2b188cc1 10165 .poll = io_uring_poll,
bebdb65e 10166#ifdef CONFIG_PROC_FS
87ce955b 10167 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 10168#endif
2b188cc1
JA
10169};
10170
c072481d
PB
10171static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10172 struct io_uring_params *p)
2b188cc1 10173{
75b28aff
HV
10174 struct io_rings *rings;
10175 size_t size, sq_array_offset;
2b188cc1 10176
bd740481
JA
10177 /* make sure these are sane, as we already accounted them */
10178 ctx->sq_entries = p->sq_entries;
10179 ctx->cq_entries = p->cq_entries;
10180
75b28aff
HV
10181 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10182 if (size == SIZE_MAX)
10183 return -EOVERFLOW;
10184
10185 rings = io_mem_alloc(size);
10186 if (!rings)
2b188cc1
JA
10187 return -ENOMEM;
10188
75b28aff
HV
10189 ctx->rings = rings;
10190 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10191 rings->sq_ring_mask = p->sq_entries - 1;
10192 rings->cq_ring_mask = p->cq_entries - 1;
10193 rings->sq_ring_entries = p->sq_entries;
10194 rings->cq_ring_entries = p->cq_entries;
2b188cc1
JA
10195
10196 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
10197 if (size == SIZE_MAX) {
10198 io_mem_free(ctx->rings);
10199 ctx->rings = NULL;
2b188cc1 10200 return -EOVERFLOW;
eb065d30 10201 }
2b188cc1
JA
10202
10203 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
10204 if (!ctx->sq_sqes) {
10205 io_mem_free(ctx->rings);
10206 ctx->rings = NULL;
2b188cc1 10207 return -ENOMEM;
eb065d30 10208 }
2b188cc1 10209
2b188cc1
JA
10210 return 0;
10211}
10212
9faadcc8
PB
10213static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10214{
10215 int ret, fd;
10216
10217 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10218 if (fd < 0)
10219 return fd;
10220
eef51daa 10221 ret = io_uring_add_tctx_node(ctx);
9faadcc8
PB
10222 if (ret) {
10223 put_unused_fd(fd);
10224 return ret;
10225 }
10226 fd_install(fd, file);
10227 return fd;
10228}
10229
2b188cc1
JA
10230/*
10231 * Allocate an anonymous fd, this is what constitutes the application
10232 * visible backing of an io_uring instance. The application mmaps this
10233 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10234 * we have to tie this fd to a socket for file garbage collection purposes.
10235 */
9faadcc8 10236static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
10237{
10238 struct file *file;
9faadcc8 10239#if defined(CONFIG_UNIX)
2b188cc1
JA
10240 int ret;
10241
2b188cc1
JA
10242 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10243 &ctx->ring_sock);
10244 if (ret)
9faadcc8 10245 return ERR_PTR(ret);
2b188cc1
JA
10246#endif
10247
2b188cc1
JA
10248 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
10249 O_RDWR | O_CLOEXEC);
2b188cc1 10250#if defined(CONFIG_UNIX)
9faadcc8
PB
10251 if (IS_ERR(file)) {
10252 sock_release(ctx->ring_sock);
10253 ctx->ring_sock = NULL;
10254 } else {
10255 ctx->ring_sock->file = file;
0f212204 10256 }
2b188cc1 10257#endif
9faadcc8 10258 return file;
2b188cc1
JA
10259}
10260
c072481d
PB
10261static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
10262 struct io_uring_params __user *params)
2b188cc1 10263{
2b188cc1 10264 struct io_ring_ctx *ctx;
9faadcc8 10265 struct file *file;
2b188cc1
JA
10266 int ret;
10267
8110c1a6 10268 if (!entries)
2b188cc1 10269 return -EINVAL;
8110c1a6
JA
10270 if (entries > IORING_MAX_ENTRIES) {
10271 if (!(p->flags & IORING_SETUP_CLAMP))
10272 return -EINVAL;
10273 entries = IORING_MAX_ENTRIES;
10274 }
2b188cc1
JA
10275
10276 /*
10277 * Use twice as many entries for the CQ ring. It's possible for the
10278 * application to drive a higher depth than the size of the SQ ring,
10279 * since the sqes are only used at submission time. This allows for
33a107f0
JA
10280 * some flexibility in overcommitting a bit. If the application has
10281 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
10282 * of CQ ring entries manually.
2b188cc1
JA
10283 */
10284 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
10285 if (p->flags & IORING_SETUP_CQSIZE) {
10286 /*
10287 * If IORING_SETUP_CQSIZE is set, we do the same roundup
10288 * to a power-of-two, if it isn't already. We do NOT impose
10289 * any cq vs sq ring sizing.
10290 */
eb2667b3 10291 if (!p->cq_entries)
33a107f0 10292 return -EINVAL;
8110c1a6
JA
10293 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10294 if (!(p->flags & IORING_SETUP_CLAMP))
10295 return -EINVAL;
10296 p->cq_entries = IORING_MAX_CQ_ENTRIES;
10297 }
eb2667b3
JQ
10298 p->cq_entries = roundup_pow_of_two(p->cq_entries);
10299 if (p->cq_entries < p->sq_entries)
10300 return -EINVAL;
33a107f0
JA
10301 } else {
10302 p->cq_entries = 2 * p->sq_entries;
10303 }
2b188cc1 10304
2b188cc1 10305 ctx = io_ring_ctx_alloc(p);
62e398be 10306 if (!ctx)
2b188cc1 10307 return -ENOMEM;
2b188cc1 10308 ctx->compat = in_compat_syscall();
62e398be
JA
10309 if (!capable(CAP_IPC_LOCK))
10310 ctx->user = get_uid(current_user());
2aede0e4
JA
10311
10312 /*
10313 * This is just grabbed for accounting purposes. When a process exits,
10314 * the mm is exited and dropped before the files, hence we need to hang
10315 * on to this mm purely for the purposes of being able to unaccount
10316 * memory (locked/pinned vm). It's not used for anything else.
10317 */
6b7898eb 10318 mmgrab(current->mm);
2aede0e4 10319 ctx->mm_account = current->mm;
6b7898eb 10320
2b188cc1
JA
10321 ret = io_allocate_scq_urings(ctx, p);
10322 if (ret)
10323 goto err;
10324
7e84e1c7 10325 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
10326 if (ret)
10327 goto err;
eae071c9 10328 /* always set a rsrc node */
47b228ce
PB
10329 ret = io_rsrc_node_switch_start(ctx);
10330 if (ret)
10331 goto err;
eae071c9 10332 io_rsrc_node_switch(ctx, NULL);
2b188cc1 10333
2b188cc1 10334 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
10335 p->sq_off.head = offsetof(struct io_rings, sq.head);
10336 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10337 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10338 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10339 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10340 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10341 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
10342
10343 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
10344 p->cq_off.head = offsetof(struct io_rings, cq.head);
10345 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10346 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10347 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10348 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10349 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 10350 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 10351
7f13657d
XW
10352 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10353 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 10354 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 10355 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9690557e
PB
10356 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10357 IORING_FEAT_RSRC_TAGS;
7f13657d
XW
10358
10359 if (copy_to_user(params, p, sizeof(*p))) {
10360 ret = -EFAULT;
10361 goto err;
10362 }
d1719f70 10363
9faadcc8
PB
10364 file = io_uring_get_file(ctx);
10365 if (IS_ERR(file)) {
10366 ret = PTR_ERR(file);
10367 goto err;
10368 }
10369
044c1ab3
JA
10370 /*
10371 * Install ring fd as the very last thing, so we don't risk someone
10372 * having closed it before we finish setup
10373 */
9faadcc8
PB
10374 ret = io_uring_install_fd(ctx, file);
10375 if (ret < 0) {
10376 /* fput will clean it up */
10377 fput(file);
10378 return ret;
10379 }
044c1ab3 10380
c826bd7a 10381 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
10382 return ret;
10383err:
10384 io_ring_ctx_wait_and_kill(ctx);
10385 return ret;
10386}
10387
10388/*
10389 * Sets up an aio uring context, and returns the fd. Applications asks for a
10390 * ring size, we return the actual sq/cq ring sizes (among other things) in the
10391 * params structure passed in.
10392 */
10393static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10394{
10395 struct io_uring_params p;
2b188cc1
JA
10396 int i;
10397
10398 if (copy_from_user(&p, params, sizeof(p)))
10399 return -EFAULT;
10400 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10401 if (p.resv[i])
10402 return -EINVAL;
10403 }
10404
6c271ce2 10405 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 10406 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7
SG
10407 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10408 IORING_SETUP_R_DISABLED))
2b188cc1
JA
10409 return -EINVAL;
10410
7f13657d 10411 return io_uring_create(entries, &p, params);
2b188cc1
JA
10412}
10413
10414SYSCALL_DEFINE2(io_uring_setup, u32, entries,
10415 struct io_uring_params __user *, params)
10416{
10417 return io_uring_setup(entries, params);
10418}
10419
c072481d
PB
10420static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10421 unsigned nr_args)
66f4af93
JA
10422{
10423 struct io_uring_probe *p;
10424 size_t size;
10425 int i, ret;
10426
10427 size = struct_size(p, ops, nr_args);
10428 if (size == SIZE_MAX)
10429 return -EOVERFLOW;
10430 p = kzalloc(size, GFP_KERNEL);
10431 if (!p)
10432 return -ENOMEM;
10433
10434 ret = -EFAULT;
10435 if (copy_from_user(p, arg, size))
10436 goto out;
10437 ret = -EINVAL;
10438 if (memchr_inv(p, 0, size))
10439 goto out;
10440
10441 p->last_op = IORING_OP_LAST - 1;
10442 if (nr_args > IORING_OP_LAST)
10443 nr_args = IORING_OP_LAST;
10444
10445 for (i = 0; i < nr_args; i++) {
10446 p->ops[i].op = i;
10447 if (!io_op_defs[i].not_supported)
10448 p->ops[i].flags = IO_URING_OP_SUPPORTED;
10449 }
10450 p->ops_len = i;
10451
10452 ret = 0;
10453 if (copy_to_user(arg, p, size))
10454 ret = -EFAULT;
10455out:
10456 kfree(p);
10457 return ret;
10458}
10459
071698e1
JA
10460static int io_register_personality(struct io_ring_ctx *ctx)
10461{
4379bf8b 10462 const struct cred *creds;
61cf9370 10463 u32 id;
1e6fa521 10464 int ret;
071698e1 10465
4379bf8b 10466 creds = get_current_cred();
1e6fa521 10467
61cf9370
MWO
10468 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10469 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
a30f895a
JA
10470 if (ret < 0) {
10471 put_cred(creds);
10472 return ret;
10473 }
10474 return id;
071698e1
JA
10475}
10476
c072481d
PB
10477static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10478 void __user *arg, unsigned int nr_args)
21b55dbc
SG
10479{
10480 struct io_uring_restriction *res;
10481 size_t size;
10482 int i, ret;
10483
7e84e1c7
SG
10484 /* Restrictions allowed only if rings started disabled */
10485 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10486 return -EBADFD;
10487
21b55dbc 10488 /* We allow only a single restrictions registration */
7e84e1c7 10489 if (ctx->restrictions.registered)
21b55dbc
SG
10490 return -EBUSY;
10491
10492 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10493 return -EINVAL;
10494
10495 size = array_size(nr_args, sizeof(*res));
10496 if (size == SIZE_MAX)
10497 return -EOVERFLOW;
10498
10499 res = memdup_user(arg, size);
10500 if (IS_ERR(res))
10501 return PTR_ERR(res);
10502
10503 ret = 0;
10504
10505 for (i = 0; i < nr_args; i++) {
10506 switch (res[i].opcode) {
10507 case IORING_RESTRICTION_REGISTER_OP:
10508 if (res[i].register_op >= IORING_REGISTER_LAST) {
10509 ret = -EINVAL;
10510 goto out;
10511 }
10512
10513 __set_bit(res[i].register_op,
10514 ctx->restrictions.register_op);
10515 break;
10516 case IORING_RESTRICTION_SQE_OP:
10517 if (res[i].sqe_op >= IORING_OP_LAST) {
10518 ret = -EINVAL;
10519 goto out;
10520 }
10521
10522 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10523 break;
10524 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10525 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10526 break;
10527 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10528 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10529 break;
10530 default:
10531 ret = -EINVAL;
10532 goto out;
10533 }
10534 }
10535
10536out:
10537 /* Reset all restrictions if an error happened */
10538 if (ret != 0)
10539 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10540 else
7e84e1c7 10541 ctx->restrictions.registered = true;
21b55dbc
SG
10542
10543 kfree(res);
10544 return ret;
10545}
10546
7e84e1c7
SG
10547static int io_register_enable_rings(struct io_ring_ctx *ctx)
10548{
10549 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10550 return -EBADFD;
10551
10552 if (ctx->restrictions.registered)
10553 ctx->restricted = 1;
10554
0298ef96
PB
10555 ctx->flags &= ~IORING_SETUP_R_DISABLED;
10556 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10557 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
10558 return 0;
10559}
10560
fdecb662 10561static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 10562 struct io_uring_rsrc_update2 *up,
98f0b3b4
PB
10563 unsigned nr_args)
10564{
10565 __u32 tmp;
10566 int err;
10567
c3bdad02
PB
10568 if (up->resv)
10569 return -EINVAL;
98f0b3b4
PB
10570 if (check_add_overflow(up->offset, nr_args, &tmp))
10571 return -EOVERFLOW;
10572 err = io_rsrc_node_switch_start(ctx);
10573 if (err)
10574 return err;
10575
fdecb662
PB
10576 switch (type) {
10577 case IORING_RSRC_FILE:
98f0b3b4 10578 return __io_sqe_files_update(ctx, up, nr_args);
634d00df
PB
10579 case IORING_RSRC_BUFFER:
10580 return __io_sqe_buffers_update(ctx, up, nr_args);
98f0b3b4
PB
10581 }
10582 return -EINVAL;
10583}
10584
c3bdad02
PB
10585static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10586 unsigned nr_args)
98f0b3b4 10587{
c3bdad02 10588 struct io_uring_rsrc_update2 up;
98f0b3b4
PB
10589
10590 if (!nr_args)
10591 return -EINVAL;
c3bdad02
PB
10592 memset(&up, 0, sizeof(up));
10593 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10594 return -EFAULT;
10595 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10596}
10597
10598static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
992da01a 10599 unsigned size, unsigned type)
c3bdad02
PB
10600{
10601 struct io_uring_rsrc_update2 up;
10602
10603 if (size != sizeof(up))
10604 return -EINVAL;
98f0b3b4
PB
10605 if (copy_from_user(&up, arg, sizeof(up)))
10606 return -EFAULT;
992da01a 10607 if (!up.nr || up.resv)
98f0b3b4 10608 return -EINVAL;
992da01a 10609 return __io_register_rsrc_update(ctx, type, &up, up.nr);
98f0b3b4
PB
10610}
10611
c072481d 10612static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
992da01a 10613 unsigned int size, unsigned int type)
792e3582
PB
10614{
10615 struct io_uring_rsrc_register rr;
10616
10617 /* keep it extendible */
10618 if (size != sizeof(rr))
10619 return -EINVAL;
10620
10621 memset(&rr, 0, sizeof(rr));
10622 if (copy_from_user(&rr, arg, size))
10623 return -EFAULT;
992da01a 10624 if (!rr.nr || rr.resv || rr.resv2)
792e3582
PB
10625 return -EINVAL;
10626
992da01a 10627 switch (type) {
792e3582
PB
10628 case IORING_RSRC_FILE:
10629 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10630 rr.nr, u64_to_user_ptr(rr.tags));
634d00df
PB
10631 case IORING_RSRC_BUFFER:
10632 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10633 rr.nr, u64_to_user_ptr(rr.tags));
792e3582
PB
10634 }
10635 return -EINVAL;
10636}
10637
c072481d
PB
10638static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10639 void __user *arg, unsigned len)
fe76421d
JA
10640{
10641 struct io_uring_task *tctx = current->io_uring;
10642 cpumask_var_t new_mask;
10643 int ret;
10644
10645 if (!tctx || !tctx->io_wq)
10646 return -EINVAL;
10647
10648 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10649 return -ENOMEM;
10650
10651 cpumask_clear(new_mask);
10652 if (len > cpumask_size())
10653 len = cpumask_size();
10654
10655 if (copy_from_user(new_mask, arg, len)) {
10656 free_cpumask_var(new_mask);
10657 return -EFAULT;
10658 }
10659
10660 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10661 free_cpumask_var(new_mask);
10662 return ret;
10663}
10664
c072481d 10665static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
fe76421d
JA
10666{
10667 struct io_uring_task *tctx = current->io_uring;
10668
10669 if (!tctx || !tctx->io_wq)
10670 return -EINVAL;
10671
10672 return io_wq_cpu_affinity(tctx->io_wq, NULL);
10673}
10674
c072481d
PB
10675static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10676 void __user *arg)
2e480058 10677{
fa84693b
JA
10678 struct io_uring_task *tctx = NULL;
10679 struct io_sq_data *sqd = NULL;
2e480058
JA
10680 __u32 new_count[2];
10681 int i, ret;
10682
2e480058
JA
10683 if (copy_from_user(new_count, arg, sizeof(new_count)))
10684 return -EFAULT;
10685 for (i = 0; i < ARRAY_SIZE(new_count); i++)
10686 if (new_count[i] > INT_MAX)
10687 return -EINVAL;
10688
fa84693b
JA
10689 if (ctx->flags & IORING_SETUP_SQPOLL) {
10690 sqd = ctx->sq_data;
10691 if (sqd) {
009ad9f0
JA
10692 /*
10693 * Observe the correct sqd->lock -> ctx->uring_lock
10694 * ordering. Fine to drop uring_lock here, we hold
10695 * a ref to the ctx.
10696 */
41d3a6bd 10697 refcount_inc(&sqd->refs);
009ad9f0 10698 mutex_unlock(&ctx->uring_lock);
fa84693b 10699 mutex_lock(&sqd->lock);
009ad9f0 10700 mutex_lock(&ctx->uring_lock);
41d3a6bd
JA
10701 if (sqd->thread)
10702 tctx = sqd->thread->io_uring;
fa84693b
JA
10703 }
10704 } else {
10705 tctx = current->io_uring;
10706 }
10707
10708 ret = -EINVAL;
10709 if (!tctx || !tctx->io_wq)
10710 goto err;
10711
2e480058
JA
10712 ret = io_wq_max_workers(tctx->io_wq, new_count);
10713 if (ret)
fa84693b
JA
10714 goto err;
10715
41d3a6bd 10716 if (sqd) {
fa84693b 10717 mutex_unlock(&sqd->lock);
41d3a6bd
JA
10718 io_put_sq_data(sqd);
10719 }
2e480058
JA
10720
10721 if (copy_to_user(arg, new_count, sizeof(new_count)))
10722 return -EFAULT;
10723
10724 return 0;
fa84693b 10725err:
41d3a6bd 10726 if (sqd) {
fa84693b 10727 mutex_unlock(&sqd->lock);
41d3a6bd
JA
10728 io_put_sq_data(sqd);
10729 }
fa84693b 10730 return ret;
2e480058
JA
10731}
10732
071698e1
JA
10733static bool io_register_op_must_quiesce(int op)
10734{
10735 switch (op) {
bd54b6fe
BM
10736 case IORING_REGISTER_BUFFERS:
10737 case IORING_UNREGISTER_BUFFERS:
f4f7d21c 10738 case IORING_REGISTER_FILES:
071698e1
JA
10739 case IORING_UNREGISTER_FILES:
10740 case IORING_REGISTER_FILES_UPDATE:
10741 case IORING_REGISTER_PROBE:
10742 case IORING_REGISTER_PERSONALITY:
10743 case IORING_UNREGISTER_PERSONALITY:
992da01a
PB
10744 case IORING_REGISTER_FILES2:
10745 case IORING_REGISTER_FILES_UPDATE2:
10746 case IORING_REGISTER_BUFFERS2:
10747 case IORING_REGISTER_BUFFERS_UPDATE:
fe76421d
JA
10748 case IORING_REGISTER_IOWQ_AFF:
10749 case IORING_UNREGISTER_IOWQ_AFF:
2e480058 10750 case IORING_REGISTER_IOWQ_MAX_WORKERS:
071698e1
JA
10751 return false;
10752 default:
10753 return true;
10754 }
10755}
10756
c072481d 10757static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
e73c5c7c
PB
10758{
10759 long ret;
10760
10761 percpu_ref_kill(&ctx->refs);
10762
10763 /*
10764 * Drop uring mutex before waiting for references to exit. If another
10765 * thread is currently inside io_uring_enter() it might need to grab the
10766 * uring_lock to make progress. If we hold it here across the drain
10767 * wait, then we can deadlock. It's safe to drop the mutex here, since
10768 * no new references will come in after we've killed the percpu ref.
10769 */
10770 mutex_unlock(&ctx->uring_lock);
10771 do {
37f0e767
PB
10772 ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
10773 if (ret) {
10774 ret = min(0L, ret);
e73c5c7c 10775 break;
37f0e767
PB
10776 }
10777
e73c5c7c 10778 ret = io_run_task_work_sig();
37f0e767 10779 io_req_caches_free(ctx);
e73c5c7c
PB
10780 } while (ret >= 0);
10781 mutex_lock(&ctx->uring_lock);
10782
10783 if (ret)
10784 io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10785 return ret;
10786}
10787
edafccee
JA
10788static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10789 void __user *arg, unsigned nr_args)
b19062a5
JA
10790 __releases(ctx->uring_lock)
10791 __acquires(ctx->uring_lock)
edafccee
JA
10792{
10793 int ret;
10794
35fa71a0
JA
10795 /*
10796 * We're inside the ring mutex, if the ref is already dying, then
10797 * someone else killed the ctx or is already going through
10798 * io_uring_register().
10799 */
10800 if (percpu_ref_is_dying(&ctx->refs))
10801 return -ENXIO;
10802
75c4021a
PB
10803 if (ctx->restricted) {
10804 if (opcode >= IORING_REGISTER_LAST)
10805 return -EINVAL;
10806 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10807 if (!test_bit(opcode, ctx->restrictions.register_op))
10808 return -EACCES;
10809 }
10810
071698e1 10811 if (io_register_op_must_quiesce(opcode)) {
e73c5c7c
PB
10812 ret = io_ctx_quiesce(ctx);
10813 if (ret)
f70865db 10814 return ret;
05f3fb3c 10815 }
edafccee
JA
10816
10817 switch (opcode) {
10818 case IORING_REGISTER_BUFFERS:
634d00df 10819 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
edafccee
JA
10820 break;
10821 case IORING_UNREGISTER_BUFFERS:
10822 ret = -EINVAL;
10823 if (arg || nr_args)
10824 break;
0a96bbe4 10825 ret = io_sqe_buffers_unregister(ctx);
edafccee 10826 break;
6b06314c 10827 case IORING_REGISTER_FILES:
792e3582 10828 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
6b06314c
JA
10829 break;
10830 case IORING_UNREGISTER_FILES:
10831 ret = -EINVAL;
10832 if (arg || nr_args)
10833 break;
10834 ret = io_sqe_files_unregister(ctx);
10835 break;
c3a31e60 10836 case IORING_REGISTER_FILES_UPDATE:
c3bdad02 10837 ret = io_register_files_update(ctx, arg, nr_args);
c3a31e60 10838 break;
9b402849 10839 case IORING_REGISTER_EVENTFD:
f2842ab5 10840 case IORING_REGISTER_EVENTFD_ASYNC:
9b402849
JA
10841 ret = -EINVAL;
10842 if (nr_args != 1)
10843 break;
10844 ret = io_eventfd_register(ctx, arg);
f2842ab5
JA
10845 if (ret)
10846 break;
10847 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10848 ctx->eventfd_async = 1;
10849 else
10850 ctx->eventfd_async = 0;
9b402849
JA
10851 break;
10852 case IORING_UNREGISTER_EVENTFD:
10853 ret = -EINVAL;
10854 if (arg || nr_args)
10855 break;
10856 ret = io_eventfd_unregister(ctx);
10857 break;
66f4af93
JA
10858 case IORING_REGISTER_PROBE:
10859 ret = -EINVAL;
10860 if (!arg || nr_args > 256)
10861 break;
10862 ret = io_probe(ctx, arg, nr_args);
10863 break;
071698e1
JA
10864 case IORING_REGISTER_PERSONALITY:
10865 ret = -EINVAL;
10866 if (arg || nr_args)
10867 break;
10868 ret = io_register_personality(ctx);
10869 break;
10870 case IORING_UNREGISTER_PERSONALITY:
10871 ret = -EINVAL;
10872 if (arg)
10873 break;
10874 ret = io_unregister_personality(ctx, nr_args);
10875 break;
7e84e1c7
SG
10876 case IORING_REGISTER_ENABLE_RINGS:
10877 ret = -EINVAL;
10878 if (arg || nr_args)
10879 break;
10880 ret = io_register_enable_rings(ctx);
10881 break;
21b55dbc
SG
10882 case IORING_REGISTER_RESTRICTIONS:
10883 ret = io_register_restrictions(ctx, arg, nr_args);
10884 break;
992da01a
PB
10885 case IORING_REGISTER_FILES2:
10886 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10887 break;
10888 case IORING_REGISTER_FILES_UPDATE2:
10889 ret = io_register_rsrc_update(ctx, arg, nr_args,
10890 IORING_RSRC_FILE);
10891 break;
10892 case IORING_REGISTER_BUFFERS2:
10893 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
792e3582 10894 break;
992da01a
PB
10895 case IORING_REGISTER_BUFFERS_UPDATE:
10896 ret = io_register_rsrc_update(ctx, arg, nr_args,
10897 IORING_RSRC_BUFFER);
c3bdad02 10898 break;
fe76421d
JA
10899 case IORING_REGISTER_IOWQ_AFF:
10900 ret = -EINVAL;
10901 if (!arg || !nr_args)
10902 break;
10903 ret = io_register_iowq_aff(ctx, arg, nr_args);
10904 break;
10905 case IORING_UNREGISTER_IOWQ_AFF:
10906 ret = -EINVAL;
10907 if (arg || nr_args)
10908 break;
10909 ret = io_unregister_iowq_aff(ctx);
10910 break;
2e480058
JA
10911 case IORING_REGISTER_IOWQ_MAX_WORKERS:
10912 ret = -EINVAL;
10913 if (!arg || nr_args != 2)
10914 break;
10915 ret = io_register_iowq_max_workers(ctx, arg);
10916 break;
edafccee
JA
10917 default:
10918 ret = -EINVAL;
10919 break;
10920 }
10921
071698e1 10922 if (io_register_op_must_quiesce(opcode)) {
05f3fb3c 10923 /* bring the ctx back to life */
05f3fb3c 10924 percpu_ref_reinit(&ctx->refs);
0f158b4c 10925 reinit_completion(&ctx->ref_comp);
05f3fb3c 10926 }
edafccee
JA
10927 return ret;
10928}
10929
10930SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10931 void __user *, arg, unsigned int, nr_args)
10932{
10933 struct io_ring_ctx *ctx;
10934 long ret = -EBADF;
10935 struct fd f;
10936
10937 f = fdget(fd);
10938 if (!f.file)
10939 return -EBADF;
10940
10941 ret = -EOPNOTSUPP;
10942 if (f.file->f_op != &io_uring_fops)
10943 goto out_fput;
10944
10945 ctx = f.file->private_data;
10946
b6c23dd5
PB
10947 io_run_task_work();
10948
edafccee
JA
10949 mutex_lock(&ctx->uring_lock);
10950 ret = __io_uring_register(ctx, opcode, arg, nr_args);
10951 mutex_unlock(&ctx->uring_lock);
c826bd7a
DD
10952 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10953 ctx->cq_ev_fd != NULL, ret);
edafccee
JA
10954out_fput:
10955 fdput(f);
10956 return ret;
10957}
10958
2b188cc1
JA
10959static int __init io_uring_init(void)
10960{
d7f62e82
SM
10961#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10962 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10963 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10964} while (0)
10965
10966#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10967 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10968 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10969 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
10970 BUILD_BUG_SQE_ELEM(1, __u8, flags);
10971 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
10972 BUILD_BUG_SQE_ELEM(4, __s32, fd);
10973 BUILD_BUG_SQE_ELEM(8, __u64, off);
10974 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
10975 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 10976 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
10977 BUILD_BUG_SQE_ELEM(24, __u32, len);
10978 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
10979 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
10980 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10981 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
10982 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
10983 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
10984 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
10985 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
10986 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
10987 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
10988 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
10989 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
10990 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
10991 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 10992 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
10993 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
10994 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
16340eab 10995 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
d7f62e82 10996 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 10997 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
b9445598 10998 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
d7f62e82 10999
b0d658ec
PB
11000 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11001 sizeof(struct io_uring_rsrc_update));
11002 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11003 sizeof(struct io_uring_rsrc_update2));
90499ad0
PB
11004
11005 /* ->buf_index is u16 */
11006 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11007
b0d658ec
PB
11008 /* should fit into one byte */
11009 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
68fe256a
PB
11010 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11011 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
b0d658ec 11012
d3656344 11013 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
32c2d33e 11014 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
16340eab 11015
91f245d5
JA
11016 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11017 SLAB_ACCOUNT);
2b188cc1
JA
11018 return 0;
11019};
11020__initcall(io_uring_init);