io_uring: inline io_req_complete_fail_submit()
[linux-block.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
d068b506 14 * through a control-dependency in io_get_cqe (smp_store_release to
1e84b97b
SB
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
52de1fe1 47#include <net/compat.h>
2b188cc1
JA
48#include <linux/refcount.h>
49#include <linux/uio.h>
6b47ee6e 50#include <linux/bits.h>
2b188cc1
JA
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
2b188cc1
JA
58#include <linux/percpu.h>
59#include <linux/slab.h>
edce22e1 60#include <linux/blk-mq.h>
edafccee 61#include <linux/bvec.h>
2b188cc1
JA
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
6b06314c 65#include <net/scm.h>
2b188cc1
JA
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
edafccee
JA
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
aa4c3967 72#include <linux/highmem.h>
15b71abe
JA
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
4840e418 75#include <linux/fadvise.h>
3e4827b0 76#include <linux/eventpoll.h>
7d67af2c 77#include <linux/splice.h>
b41e9852 78#include <linux/task_work.h>
bcf5a063 79#include <linux/pagemap.h>
0f212204 80#include <linux/io_uring.h>
5bd2182d 81#include <linux/audit.h>
cdc1404a 82#include <linux/security.h>
2b188cc1 83
c826bd7a
DD
84#define CREATE_TRACE_POINTS
85#include <trace/events/io_uring.h>
86
2b188cc1
JA
87#include <uapi/linux/io_uring.h>
88
89#include "internal.h"
561fb04a 90#include "io-wq.h"
2b188cc1 91
5277deaa 92#define IORING_MAX_ENTRIES 32768
33a107f0 93#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
4ce8ad95 94#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
65e19f54 95
187f08c1 96/* only define max */
042b0d85 97#define IORING_MAX_FIXED_FILES (1U << 15)
21b55dbc
SG
98#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
99 IORING_REGISTER_LAST + IORING_OP_LAST)
2b188cc1 100
187f08c1 101#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
2d091d62
PB
102#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
103#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
104
489809e2
PB
105#define IORING_MAX_REG_BUFFERS (1U << 14)
106
68fe256a
PB
107#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
108 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
109
5562a8d7
PB
110#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
111 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
68fe256a 112
c854357b 113#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
d5361233 114 REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
b16fed66 115
a538be5b
PB
116#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
117 IO_REQ_CLEAN_FLAGS)
118
09899b19
PB
119#define IO_TCTX_REFS_CACHE_NR (1U << 10)
120
2b188cc1
JA
121struct io_uring {
122 u32 head ____cacheline_aligned_in_smp;
123 u32 tail ____cacheline_aligned_in_smp;
124};
125
1e84b97b 126/*
75b28aff
HV
127 * This data is shared with the application through the mmap at offsets
128 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
129 *
130 * The offsets to the member fields are published through struct
131 * io_sqring_offsets when calling io_uring_setup.
132 */
75b28aff 133struct io_rings {
1e84b97b
SB
134 /*
135 * Head and tail offsets into the ring; the offsets need to be
136 * masked to get valid indices.
137 *
75b28aff
HV
138 * The kernel controls head of the sq ring and the tail of the cq ring,
139 * and the application controls tail of the sq ring and the head of the
140 * cq ring.
1e84b97b 141 */
75b28aff 142 struct io_uring sq, cq;
1e84b97b 143 /*
75b28aff 144 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
145 * ring_entries - 1)
146 */
75b28aff
HV
147 u32 sq_ring_mask, cq_ring_mask;
148 /* Ring sizes (constant, power of 2) */
149 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
150 /*
151 * Number of invalid entries dropped by the kernel due to
152 * invalid index stored in array
153 *
154 * Written by the kernel, shouldn't be modified by the
155 * application (i.e. get number of "new events" by comparing to
156 * cached value).
157 *
158 * After a new SQ head value was read by the application this
159 * counter includes all submissions that were dropped reaching
160 * the new SQ head (and possibly more).
161 */
75b28aff 162 u32 sq_dropped;
1e84b97b 163 /*
0d9b5b3a 164 * Runtime SQ flags
1e84b97b
SB
165 *
166 * Written by the kernel, shouldn't be modified by the
167 * application.
168 *
169 * The application needs a full memory barrier before checking
170 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
171 */
75b28aff 172 u32 sq_flags;
0d9b5b3a
SG
173 /*
174 * Runtime CQ flags
175 *
176 * Written by the application, shouldn't be modified by the
177 * kernel.
178 */
fe7e3257 179 u32 cq_flags;
1e84b97b
SB
180 /*
181 * Number of completion events lost because the queue was full;
182 * this should be avoided by the application by making sure
0b4295b5 183 * there are not more requests pending than there is space in
1e84b97b
SB
184 * the completion queue.
185 *
186 * Written by the kernel, shouldn't be modified by the
187 * application (i.e. get number of "new events" by comparing to
188 * cached value).
189 *
190 * As completion events come in out of order this counter is not
191 * ordered with any other data.
192 */
75b28aff 193 u32 cq_overflow;
1e84b97b
SB
194 /*
195 * Ring buffer of completion events.
196 *
197 * The kernel writes completion events fresh every time they are
198 * produced, so the application is allowed to modify pending
199 * entries.
200 */
75b28aff 201 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
202};
203
45d189c6 204enum io_uring_cmd_flags {
51aac424 205 IO_URING_F_COMPLETE_DEFER = 1,
3b44b371 206 IO_URING_F_UNLOCKED = 2,
51aac424
PB
207 /* int's last bit, sign checks are usually faster than a bit test */
208 IO_URING_F_NONBLOCK = INT_MIN,
45d189c6
PB
209};
210
edafccee
JA
211struct io_mapped_ubuf {
212 u64 ubuf;
4751f53d 213 u64 ubuf_end;
edafccee 214 unsigned int nr_bvecs;
de293938 215 unsigned long acct_pages;
41edf1a5 216 struct bio_vec bvec[];
edafccee
JA
217};
218
50238531
BM
219struct io_ring_ctx;
220
6c2450ae
PB
221struct io_overflow_cqe {
222 struct io_uring_cqe cqe;
223 struct list_head list;
224};
225
a04b0ac0
PB
226struct io_fixed_file {
227 /* file * with additional FFS_* flags */
228 unsigned long file_ptr;
229};
230
269bbe5f
BM
231struct io_rsrc_put {
232 struct list_head list;
b60c8dce 233 u64 tag;
50238531
BM
234 union {
235 void *rsrc;
236 struct file *file;
bd54b6fe 237 struct io_mapped_ubuf *buf;
50238531 238 };
269bbe5f
BM
239};
240
aeca241b 241struct io_file_table {
042b0d85 242 struct io_fixed_file *files;
31b51510
JA
243};
244
b895c9a6 245struct io_rsrc_node {
05589553
XW
246 struct percpu_ref refs;
247 struct list_head node;
269bbe5f 248 struct list_head rsrc_list;
b895c9a6 249 struct io_rsrc_data *rsrc_data;
4a38aed2 250 struct llist_node llist;
e297822b 251 bool done;
05589553
XW
252};
253
40ae0ff7
PB
254typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
255
b895c9a6 256struct io_rsrc_data {
05f3fb3c
JA
257 struct io_ring_ctx *ctx;
258
2d091d62
PB
259 u64 **tags;
260 unsigned int nr;
40ae0ff7 261 rsrc_put_fn *do_put;
3e942498 262 atomic_t refs;
05f3fb3c 263 struct completion done;
8bad28d8 264 bool quiesce;
05f3fb3c
JA
265};
266
dbc7d452
JA
267struct io_buffer_list {
268 struct list_head list;
269 struct list_head buf_list;
270 __u16 bgid;
271};
272
5a2e745d
JA
273struct io_buffer {
274 struct list_head list;
275 __u64 addr;
d1f82808 276 __u32 len;
5a2e745d 277 __u16 bid;
b1c62645 278 __u16 bgid;
5a2e745d
JA
279};
280
21b55dbc
SG
281struct io_restriction {
282 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
283 DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
284 u8 sqe_flags_allowed;
285 u8 sqe_flags_required;
7e84e1c7 286 bool registered;
21b55dbc
SG
287};
288
37d1e2e3
JA
289enum {
290 IO_SQ_THREAD_SHOULD_STOP = 0,
291 IO_SQ_THREAD_SHOULD_PARK,
292};
293
534ca6d6
JA
294struct io_sq_data {
295 refcount_t refs;
9e138a48 296 atomic_t park_pending;
09a6f4ef 297 struct mutex lock;
69fb2131
JA
298
299 /* ctx's that are using this sqd */
300 struct list_head ctx_list;
69fb2131 301
534ca6d6
JA
302 struct task_struct *thread;
303 struct wait_queue_head wait;
08369246
XW
304
305 unsigned sq_thread_idle;
37d1e2e3
JA
306 int sq_cpu;
307 pid_t task_pid;
5c2469e0 308 pid_t task_tgid;
37d1e2e3
JA
309
310 unsigned long state;
37d1e2e3 311 struct completion exited;
534ca6d6
JA
312};
313
6dd0be1e 314#define IO_COMPL_BATCH 32
6ff119a6 315#define IO_REQ_CACHE_SIZE 32
bf019da7 316#define IO_REQ_ALLOC_BATCH 8
258b29a9 317
a1ab7b35
PB
318struct io_submit_link {
319 struct io_kiocb *head;
320 struct io_kiocb *last;
321};
322
258b29a9 323struct io_submit_state {
5a158c6b
PB
324 /* inline/task_work completion list, under ->uring_lock */
325 struct io_wq_work_node free_list;
326 /* batch completion logic */
327 struct io_wq_work_list compl_reqs;
a1ab7b35 328 struct io_submit_link link;
258b29a9 329
258b29a9 330 bool plug_started;
4b628aeb 331 bool need_plug;
3d4aeb9f 332 bool flush_cqes;
5ca7a8b3 333 unsigned short submit_nr;
5a158c6b 334 struct blk_plug plug;
258b29a9
PB
335};
336
77bc59b4
UA
337struct io_ev_fd {
338 struct eventfd_ctx *cq_ev_fd;
c75312dd 339 unsigned int eventfd_async: 1;
77bc59b4
UA
340 struct rcu_head rcu;
341};
342
dbc7d452
JA
343#define IO_BUFFERS_HASH_BITS 5
344
2b188cc1 345struct io_ring_ctx {
b52ecf8c 346 /* const or read-mostly hot data */
2b188cc1
JA
347 struct {
348 struct percpu_ref refs;
2b188cc1 349
b52ecf8c 350 struct io_rings *rings;
2b188cc1 351 unsigned int flags;
e1d85334 352 unsigned int compat: 1;
e1d85334 353 unsigned int drain_next: 1;
21b55dbc 354 unsigned int restricted: 1;
f18ee4cf 355 unsigned int off_timeout_used: 1;
10c66904 356 unsigned int drain_active: 1;
5562a8d7 357 unsigned int drain_disabled: 1;
9aa8dfde 358 unsigned int has_evfd: 1;
773697b6 359 unsigned int syscall_iopoll: 1;
b52ecf8c 360 } ____cacheline_aligned_in_smp;
2b188cc1 361
7f1129d2 362 /* submission data */
b52ecf8c 363 struct {
0499e582
PB
364 struct mutex uring_lock;
365
75b28aff
HV
366 /*
367 * Ring buffer of indices into array of io_uring_sqe, which is
368 * mmapped by the application using the IORING_OFF_SQES offset.
369 *
370 * This indirection could e.g. be used to assign fixed
371 * io_uring_sqe entries to operations and only submit them to
372 * the queue when needed.
373 *
374 * The kernel modifies neither the indices array nor the entries
375 * array.
376 */
377 u32 *sq_array;
c7af47cf 378 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
379 unsigned cached_sq_head;
380 unsigned sq_entries;
de0617e4 381 struct list_head defer_list;
7f1129d2
PB
382
383 /*
384 * Fixed resources fast path, should be accessed only under
385 * uring_lock, and updated through io_uring_register(2)
386 */
387 struct io_rsrc_node *rsrc_node;
ab409402 388 int rsrc_cached_refs;
7f1129d2
PB
389 struct io_file_table file_table;
390 unsigned nr_user_files;
391 unsigned nr_user_bufs;
392 struct io_mapped_ubuf **user_bufs;
393
394 struct io_submit_state submit_state;
5262f567 395 struct list_head timeout_list;
ef9dd637 396 struct list_head ltimeout_list;
1d7bb1d5 397 struct list_head cq_overflow_list;
dbc7d452 398 struct list_head *io_buffers;
cc3cec83 399 struct list_head io_buffers_cache;
4d9237e3 400 struct list_head apoll_cache;
7f1129d2
PB
401 struct xarray personalities;
402 u32 pers_next;
403 unsigned sq_thread_idle;
2b188cc1
JA
404 } ____cacheline_aligned_in_smp;
405
d0acdee2 406 /* IRQ completion list, under ->completion_lock */
c2b6c6bc 407 struct io_wq_work_list locked_free_list;
d0acdee2 408 unsigned int locked_free_nr;
3c1a2ead 409
7c30f36a 410 const struct cred *sq_creds; /* cred used for __io_sq_thread() */
534ca6d6
JA
411 struct io_sq_data *sq_data; /* if using sq thread polling */
412
90554200 413 struct wait_queue_head sqo_sq_wait;
69fb2131 414 struct list_head sqd_list;
75b28aff 415
5ed7a37d
PB
416 unsigned long check_cq_overflow;
417
206aefde 418 struct {
d8da428b
PB
419 /*
420 * We cache a range of free CQEs we can use, once exhausted it
421 * should go through a slower range setup, see __io_get_cqe()
422 */
423 struct io_uring_cqe *cqe_cached;
424 struct io_uring_cqe *cqe_sentinel;
425
206aefde
JA
426 unsigned cached_cq_tail;
427 unsigned cq_entries;
77bc59b4 428 struct io_ev_fd __rcu *io_ev_fd;
0499e582
PB
429 struct wait_queue_head cq_wait;
430 unsigned cq_extra;
431 atomic_t cq_timeouts;
0499e582 432 unsigned cq_last_tm_flush;
206aefde 433 } ____cacheline_aligned_in_smp;
2b188cc1 434
2b188cc1
JA
435 struct {
436 spinlock_t completion_lock;
e94f141b 437
89850fce
JA
438 spinlock_t timeout_lock;
439
def596e9 440 /*
540e32a0 441 * ->iopoll_list is protected by the ctx->uring_lock for
def596e9
JA
442 * io_uring instances that don't use IORING_SETUP_SQPOLL.
443 * For SQPOLL, only the single threaded io_sq_thread() will
444 * manipulate the list, hence no extra locking is needed there.
445 */
5eef4e87 446 struct io_wq_work_list iopoll_list;
78076bb6
JA
447 struct hlist_head *cancel_hash;
448 unsigned cancel_hash_bits;
915b3dde 449 bool poll_multi_queue;
cc3cec83
JA
450
451 struct list_head io_buffers_comp;
2b188cc1 452 } ____cacheline_aligned_in_smp;
85faa7b8 453
21b55dbc 454 struct io_restriction restrictions;
3c1a2ead 455
b13a8918
PB
456 /* slow path rsrc auxilary data, used by update/register */
457 struct {
458 struct io_rsrc_node *rsrc_backup_node;
459 struct io_mapped_ubuf *dummy_ubuf;
460 struct io_rsrc_data *file_data;
461 struct io_rsrc_data *buf_data;
462
463 struct delayed_work rsrc_put_work;
464 struct llist_head rsrc_put_llist;
465 struct list_head rsrc_ref_list;
466 spinlock_t rsrc_ref_lock;
cc3cec83
JA
467
468 struct list_head io_buffers_pages;
b13a8918
PB
469 };
470
3c1a2ead 471 /* Keep this last, we don't need it for the fast path */
b986af7e
PB
472 struct {
473 #if defined(CONFIG_UNIX)
474 struct socket *ring_sock;
475 #endif
476 /* hashed buffered write serialization */
477 struct io_wq_hash *hash_map;
478
479 /* Only used for accounting purposes */
480 struct user_struct *user;
481 struct mm_struct *mm_account;
482
483 /* ctx exit and cancelation */
9011bf9a
PB
484 struct llist_head fallback_llist;
485 struct delayed_work fallback_work;
b986af7e
PB
486 struct work_struct exit_work;
487 struct list_head tctx_list;
488 struct completion ref_comp;
e139a1ec
PB
489 u32 iowq_limits[2];
490 bool iowq_limits_set;
b986af7e 491 };
2b188cc1
JA
492};
493
e7a6c00d
JA
494/*
495 * Arbitrary limit, can be raised if need be
496 */
497#define IO_RINGFD_REG_MAX 16
498
53e043b2
SM
499struct io_uring_task {
500 /* submission side */
09899b19 501 int cached_refs;
53e043b2
SM
502 struct xarray xa;
503 struct wait_queue_head wait;
ee53fb2b
SM
504 const struct io_ring_ctx *last;
505 struct io_wq *io_wq;
53e043b2
SM
506 struct percpu_counter inflight;
507 atomic_t in_idle;
53e043b2
SM
508
509 spinlock_t task_lock;
510 struct io_wq_work_list task_list;
4813c377 511 struct io_wq_work_list prior_task_list;
53e043b2 512 struct callback_head task_work;
e7a6c00d 513 struct file **registered_rings;
6294f368 514 bool task_running;
53e043b2
SM
515};
516
09bb8394
JA
517/*
518 * First field must be the file pointer in all the
519 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
520 */
221c5eb2
JA
521struct io_poll_iocb {
522 struct file *file;
018043be 523 struct wait_queue_head *head;
221c5eb2 524 __poll_t events;
392edb45 525 struct wait_queue_entry wait;
221c5eb2
JA
526};
527
9d805892 528struct io_poll_update {
018043be 529 struct file *file;
9d805892
PB
530 u64 old_user_data;
531 u64 new_user_data;
532 __poll_t events;
b69de288
JA
533 bool update_events;
534 bool update_user_data;
018043be
PB
535};
536
b5dba59e
JA
537struct io_close {
538 struct file *file;
b5dba59e 539 int fd;
7df778be 540 u32 file_slot;
b5dba59e
JA
541};
542
ad8a48ac
JA
543struct io_timeout_data {
544 struct io_kiocb *req;
545 struct hrtimer timer;
546 struct timespec64 ts;
547 enum hrtimer_mode mode;
50c1df2b 548 u32 flags;
ad8a48ac
JA
549};
550
8ed8d3c3
JA
551struct io_accept {
552 struct file *file;
553 struct sockaddr __user *addr;
554 int __user *addr_len;
555 int flags;
aaa4db12 556 u32 file_slot;
09952e3e 557 unsigned long nofile;
8ed8d3c3
JA
558};
559
560struct io_sync {
561 struct file *file;
562 loff_t len;
563 loff_t off;
564 int flags;
d63d1b5e 565 int mode;
8ed8d3c3
JA
566};
567
fbf23849
JA
568struct io_cancel {
569 struct file *file;
570 u64 addr;
571};
572
b29472ee
JA
573struct io_timeout {
574 struct file *file;
bfe68a22
PB
575 u32 off;
576 u32 target_seq;
135fcde8 577 struct list_head list;
90cd7e42
PB
578 /* head of the link, used by linked timeouts only */
579 struct io_kiocb *head;
89b263f6
JA
580 /* for linked completions */
581 struct io_kiocb *prev;
b29472ee
JA
582};
583
0bdf7a2d
PB
584struct io_timeout_rem {
585 struct file *file;
586 u64 addr;
9c8e11b3
PB
587
588 /* timeout update */
589 struct timespec64 ts;
590 u32 flags;
f1042b6c 591 bool ltimeout;
0bdf7a2d
PB
592};
593
9adbd45d
JA
594struct io_rw {
595 /* NOTE: kiocb has the file as the first member, so don't do it here */
596 struct kiocb kiocb;
597 u64 addr;
584b0180
JA
598 u32 len;
599 u32 flags;
9adbd45d
JA
600};
601
3fbb51c1
JA
602struct io_connect {
603 struct file *file;
604 struct sockaddr __user *addr;
605 int addr_len;
606};
607
e47293fd
JA
608struct io_sr_msg {
609 struct file *file;
fddaface 610 union {
4af3417a
PB
611 struct compat_msghdr __user *umsg_compat;
612 struct user_msghdr __user *umsg;
613 void __user *buf;
fddaface 614 };
e47293fd 615 int msg_flags;
bcda7baa 616 int bgid;
fddaface 617 size_t len;
7ba89d2a 618 size_t done_io;
e47293fd
JA
619};
620
15b71abe
JA
621struct io_open {
622 struct file *file;
623 int dfd;
b9445598 624 u32 file_slot;
15b71abe 625 struct filename *filename;
c12cedf2 626 struct open_how how;
4022e7af 627 unsigned long nofile;
15b71abe
JA
628};
629
269bbe5f 630struct io_rsrc_update {
05f3fb3c
JA
631 struct file *file;
632 u64 arg;
633 u32 nr_args;
634 u32 offset;
635};
636
4840e418
JA
637struct io_fadvise {
638 struct file *file;
639 u64 offset;
640 u32 len;
641 u32 advice;
642};
643
c1ca757b
JA
644struct io_madvise {
645 struct file *file;
646 u64 addr;
647 u32 len;
648 u32 advice;
649};
650
3e4827b0
JA
651struct io_epoll {
652 struct file *file;
653 int epfd;
654 int op;
655 int fd;
656 struct epoll_event event;
e47293fd
JA
657};
658
7d67af2c
PB
659struct io_splice {
660 struct file *file_out;
7d67af2c
PB
661 loff_t off_out;
662 loff_t off_in;
663 u64 len;
a3e4bc23 664 int splice_fd_in;
7d67af2c
PB
665 unsigned int flags;
666};
667
ddf0322d
JA
668struct io_provide_buf {
669 struct file *file;
670 __u64 addr;
38134ada 671 __u32 len;
ddf0322d
JA
672 __u32 bgid;
673 __u16 nbufs;
674 __u16 bid;
675};
676
1d9e1288
BM
677struct io_statx {
678 struct file *file;
679 int dfd;
680 unsigned int mask;
681 unsigned int flags;
1b6fe6e0 682 struct filename *filename;
1d9e1288
BM
683 struct statx __user *buffer;
684};
685
36f4fa68
JA
686struct io_shutdown {
687 struct file *file;
688 int how;
689};
690
80a261fd
JA
691struct io_rename {
692 struct file *file;
693 int old_dfd;
694 int new_dfd;
695 struct filename *oldpath;
696 struct filename *newpath;
697 int flags;
698};
699
14a1143b
JA
700struct io_unlink {
701 struct file *file;
702 int dfd;
703 int flags;
704 struct filename *filename;
705};
706
e34a02dc
DK
707struct io_mkdir {
708 struct file *file;
709 int dfd;
710 umode_t mode;
711 struct filename *filename;
712};
713
7a8721f8
DK
714struct io_symlink {
715 struct file *file;
716 int new_dfd;
717 struct filename *oldpath;
718 struct filename *newpath;
719};
720
cf30da90
DK
721struct io_hardlink {
722 struct file *file;
723 int old_dfd;
724 int new_dfd;
725 struct filename *oldpath;
726 struct filename *newpath;
727 int flags;
728};
729
4f57f06c
JA
730struct io_msg {
731 struct file *file;
732 u64 user_data;
733 u32 len;
734};
735
f499a021
JA
736struct io_async_connect {
737 struct sockaddr_storage address;
738};
739
03b1230c
JA
740struct io_async_msghdr {
741 struct iovec fast_iov[UIO_FASTIOV];
257e84a5
PB
742 /* points to an allocated iov, if NULL we use fast_iov instead */
743 struct iovec *free_iov;
03b1230c
JA
744 struct sockaddr __user *uaddr;
745 struct msghdr msg;
b537916c 746 struct sockaddr_storage addr;
03b1230c
JA
747};
748
538941e2 749struct io_rw_state {
ff6165b2 750 struct iov_iter iter;
cd658695 751 struct iov_iter_state iter_state;
c88598a9 752 struct iovec fast_iov[UIO_FASTIOV];
538941e2
PB
753};
754
755struct io_async_rw {
756 struct io_rw_state s;
757 const struct iovec *free_iovec;
227c0c96 758 size_t bytes_done;
bcf5a063 759 struct wait_page_queue wpq;
f67676d1
JA
760};
761
6b47ee6e
PB
762enum {
763 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
764 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
765 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
766 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
767 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 768 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
04c76b41 769 REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
6b47ee6e 770
dddca226 771 /* first byte is taken by user flags, shift it to not overlap */
93d2bcd2 772 REQ_F_FAIL_BIT = 8,
6b47ee6e
PB
773 REQ_F_INFLIGHT_BIT,
774 REQ_F_CUR_POS_BIT,
775 REQ_F_NOWAIT_BIT,
6b47ee6e 776 REQ_F_LINK_TIMEOUT_BIT,
99bc4c38 777 REQ_F_NEED_CLEANUP_BIT,
d7718a9d 778 REQ_F_POLLED_BIT,
bcda7baa 779 REQ_F_BUFFER_SELECTED_BIT,
e342c807 780 REQ_F_COMPLETE_INLINE_BIT,
230d50d4 781 REQ_F_REISSUE_BIT,
b8e64b53 782 REQ_F_CREDS_BIT,
20e60a38 783 REQ_F_REFCOUNT_BIT,
4d13d1a4 784 REQ_F_ARM_LTIMEOUT_BIT,
d886e185 785 REQ_F_ASYNC_DATA_BIT,
04c76b41 786 REQ_F_SKIP_LINK_CQES_BIT,
91eac1c6
JA
787 REQ_F_SINGLE_POLL_BIT,
788 REQ_F_DOUBLE_POLL_BIT,
8a3e8ee5 789 REQ_F_PARTIAL_IO_BIT,
7b29f92d 790 /* keep async read/write and isreg together and in order */
35645ac3 791 REQ_F_SUPPORT_NOWAIT_BIT,
7b29f92d 792 REQ_F_ISREG_BIT,
84557871
JA
793
794 /* not a real bit, just to check we're not overflowing the space */
795 __REQ_F_LAST_BIT,
6b47ee6e
PB
796};
797
798enum {
799 /* ctx owns file */
800 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
801 /* drain existing IO first */
802 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
803 /* linked sqes */
804 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
805 /* doesn't sever on completion < 0 */
806 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
807 /* IOSQE_ASYNC */
808 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
809 /* IOSQE_BUFFER_SELECT */
810 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
04c76b41
PB
811 /* IOSQE_CQE_SKIP_SUCCESS */
812 REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
6b47ee6e 813
6b47ee6e 814 /* fail rest of links */
93d2bcd2 815 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
b05a1bcd 816 /* on inflight list, should be cancelled and waited on exit reliably */
6b47ee6e
PB
817 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
818 /* read/write uses file position */
819 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
820 /* must not punt to workers */
821 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
900fad45 822 /* has or had linked timeout */
6b47ee6e 823 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
99bc4c38
PB
824 /* needs cleanup */
825 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
d7718a9d
JA
826 /* already went through poll handler */
827 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
828 /* buffer already selected */
829 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
e342c807
PB
830 /* completion is deferred through io_comp_state */
831 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
230d50d4
JA
832 /* caller should reissue async */
833 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
35645ac3
PB
834 /* supports async reads/writes */
835 REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
7b29f92d
JA
836 /* regular file */
837 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
b8e64b53
PB
838 /* has creds assigned */
839 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
20e60a38
PB
840 /* skip refcounting if not set */
841 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
4d13d1a4
PB
842 /* there is a linked timeout that has to be armed */
843 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
d886e185
PB
844 /* ->async_data allocated */
845 REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
04c76b41
PB
846 /* don't post CQEs while failing linked requests */
847 REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
91eac1c6
JA
848 /* single poll may be active */
849 REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
850 /* double poll may active */
851 REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
8a3e8ee5
JA
852 /* request has already done partial IO */
853 REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
d7718a9d
JA
854};
855
856struct async_poll {
857 struct io_poll_iocb poll;
807abcb0 858 struct io_poll_iocb *double_poll;
6b47ee6e
PB
859};
860
f237c30a 861typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
5b0a6acc 862
7cbf1722 863struct io_task_work {
5b0a6acc
PB
864 union {
865 struct io_wq_work_node node;
866 struct llist_node fallback_node;
867 };
868 io_req_tw_func_t func;
7cbf1722
JA
869};
870
992da01a
PB
871enum {
872 IORING_RSRC_FILE = 0,
873 IORING_RSRC_BUFFER = 1,
874};
875
cef216fc
PB
876struct io_cqe {
877 __u64 user_data;
878 __s32 res;
879 /* fd initially, then cflags for completion */
880 union {
881 __u32 flags;
882 int fd;
883 };
884};
885
09bb8394
JA
886/*
887 * NOTE! Each of the iocb union members has the file pointer
888 * as the first entry in their struct definition. So you can
889 * access the file pointer through any of the sub-structs,
63c36549 890 * or directly as just 'file' in this struct.
09bb8394 891 */
2b188cc1 892struct io_kiocb {
221c5eb2 893 union {
09bb8394 894 struct file *file;
9adbd45d 895 struct io_rw rw;
221c5eb2 896 struct io_poll_iocb poll;
9d805892 897 struct io_poll_update poll_update;
8ed8d3c3
JA
898 struct io_accept accept;
899 struct io_sync sync;
fbf23849 900 struct io_cancel cancel;
b29472ee 901 struct io_timeout timeout;
0bdf7a2d 902 struct io_timeout_rem timeout_rem;
3fbb51c1 903 struct io_connect connect;
e47293fd 904 struct io_sr_msg sr_msg;
15b71abe 905 struct io_open open;
b5dba59e 906 struct io_close close;
269bbe5f 907 struct io_rsrc_update rsrc_update;
4840e418 908 struct io_fadvise fadvise;
c1ca757b 909 struct io_madvise madvise;
3e4827b0 910 struct io_epoll epoll;
7d67af2c 911 struct io_splice splice;
ddf0322d 912 struct io_provide_buf pbuf;
1d9e1288 913 struct io_statx statx;
36f4fa68 914 struct io_shutdown shutdown;
80a261fd 915 struct io_rename rename;
14a1143b 916 struct io_unlink unlink;
e34a02dc 917 struct io_mkdir mkdir;
7a8721f8 918 struct io_symlink symlink;
cf30da90 919 struct io_hardlink hardlink;
4f57f06c 920 struct io_msg msg;
221c5eb2 921 };
2b188cc1 922
d625c6ee 923 u8 opcode;
65a6543d
XW
924 /* polled IO has completed */
925 u8 iopoll_completed;
4f4eeba8 926 u16 buf_index;
d17e56eb
PB
927 unsigned int flags;
928
cef216fc 929 struct io_cqe cqe;
4f4eeba8 930
010e8e6b 931 struct io_ring_ctx *ctx;
010e8e6b 932 struct task_struct *task;
d7718a9d 933
269bbe5f 934 struct percpu_ref *fixed_rsrc_refs;
d886e185
PB
935 /* store used ubuf, so we can prevent reloading */
936 struct io_mapped_ubuf *imu;
fcb323cc 937
2804ecd8
JA
938 union {
939 /* used by request caches, completion batching and iopoll */
940 struct io_wq_work_node comp_list;
941 /* cache ->apoll->events */
942 int apoll_events;
943 };
d17e56eb 944 atomic_t refs;
521d61fc 945 atomic_t poll_refs;
5b0a6acc 946 struct io_task_work io_task_work;
010e8e6b
PB
947 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
948 struct hlist_node hash_node;
7e3709d5 949 /* internal polling, see IORING_FEAT_FAST_POLL */
010e8e6b 950 struct async_poll *apoll;
d886e185
PB
951 /* opcode allocated if it needs to store data for async defer */
952 void *async_data;
7e3709d5 953 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
30d51dd4 954 struct io_buffer *kbuf;
41cdcc22 955 /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
34d2bfe7 956 struct io_kiocb *link;
41cdcc22 957 /* custom credentials, valid IFF REQ_F_CREDS is set */
521d61fc
JA
958 const struct cred *creds;
959 struct io_wq_work work;
2b188cc1 960};
05589553 961
13bf43f5
PB
962struct io_tctx_node {
963 struct list_head ctx_node;
964 struct task_struct *task;
13bf43f5
PB
965 struct io_ring_ctx *ctx;
966};
967
27dc8338
PB
968struct io_defer_entry {
969 struct list_head list;
970 struct io_kiocb *req;
9cf7c104 971 u32 seq;
2b188cc1
JA
972};
973
d3656344 974struct io_op_def {
d3656344
JA
975 /* needs req->file assigned */
976 unsigned needs_file : 1;
6d63416d
PB
977 /* should block plug */
978 unsigned plug : 1;
d3656344
JA
979 /* hash wq insertion if file is a regular file */
980 unsigned hash_reg_file : 1;
981 /* unbound wq insertion if file is a non-regular file */
982 unsigned unbound_nonreg_file : 1;
8a72758c
JA
983 /* set if opcode supports polled "wait" */
984 unsigned pollin : 1;
985 unsigned pollout : 1;
52dd8640 986 unsigned poll_exclusive : 1;
bcda7baa
JA
987 /* op supports buffer selection */
988 unsigned buffer_select : 1;
26f0505a
PB
989 /* do prep async if is going to be punted */
990 unsigned needs_async_setup : 1;
6d63416d
PB
991 /* opcode is not supported by this kernel */
992 unsigned not_supported : 1;
5bd2182d
PM
993 /* skip auditing */
994 unsigned audit_skip : 1;
e8c2bc1f
JA
995 /* size of async data needed, if any */
996 unsigned short async_size;
d3656344
JA
997};
998
0918682b 999static const struct io_op_def io_op_defs[] = {
0463b6c5
PB
1000 [IORING_OP_NOP] = {},
1001 [IORING_OP_READV] = {
d3656344
JA
1002 .needs_file = 1,
1003 .unbound_nonreg_file = 1,
8a72758c 1004 .pollin = 1,
4d954c25 1005 .buffer_select = 1,
26f0505a 1006 .needs_async_setup = 1,
27926b68 1007 .plug = 1,
5bd2182d 1008 .audit_skip = 1,
e8c2bc1f 1009 .async_size = sizeof(struct io_async_rw),
d3656344 1010 },
0463b6c5 1011 [IORING_OP_WRITEV] = {
d3656344
JA
1012 .needs_file = 1,
1013 .hash_reg_file = 1,
1014 .unbound_nonreg_file = 1,
8a72758c 1015 .pollout = 1,
26f0505a 1016 .needs_async_setup = 1,
27926b68 1017 .plug = 1,
5bd2182d 1018 .audit_skip = 1,
e8c2bc1f 1019 .async_size = sizeof(struct io_async_rw),
d3656344 1020 },
0463b6c5 1021 [IORING_OP_FSYNC] = {
d3656344 1022 .needs_file = 1,
5bd2182d 1023 .audit_skip = 1,
d3656344 1024 },
0463b6c5 1025 [IORING_OP_READ_FIXED] = {
d3656344
JA
1026 .needs_file = 1,
1027 .unbound_nonreg_file = 1,
8a72758c 1028 .pollin = 1,
27926b68 1029 .plug = 1,
5bd2182d 1030 .audit_skip = 1,
e8c2bc1f 1031 .async_size = sizeof(struct io_async_rw),
d3656344 1032 },
0463b6c5 1033 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
1034 .needs_file = 1,
1035 .hash_reg_file = 1,
1036 .unbound_nonreg_file = 1,
8a72758c 1037 .pollout = 1,
27926b68 1038 .plug = 1,
5bd2182d 1039 .audit_skip = 1,
e8c2bc1f 1040 .async_size = sizeof(struct io_async_rw),
d3656344 1041 },
0463b6c5 1042 [IORING_OP_POLL_ADD] = {
d3656344
JA
1043 .needs_file = 1,
1044 .unbound_nonreg_file = 1,
5bd2182d
PM
1045 .audit_skip = 1,
1046 },
1047 [IORING_OP_POLL_REMOVE] = {
1048 .audit_skip = 1,
d3656344 1049 },
0463b6c5 1050 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344 1051 .needs_file = 1,
5bd2182d 1052 .audit_skip = 1,
d3656344 1053 },
0463b6c5 1054 [IORING_OP_SENDMSG] = {
d3656344
JA
1055 .needs_file = 1,
1056 .unbound_nonreg_file = 1,
8a72758c 1057 .pollout = 1,
26f0505a 1058 .needs_async_setup = 1,
e8c2bc1f 1059 .async_size = sizeof(struct io_async_msghdr),
d3656344 1060 },
0463b6c5 1061 [IORING_OP_RECVMSG] = {
d3656344
JA
1062 .needs_file = 1,
1063 .unbound_nonreg_file = 1,
8a72758c 1064 .pollin = 1,
52de1fe1 1065 .buffer_select = 1,
26f0505a 1066 .needs_async_setup = 1,
e8c2bc1f 1067 .async_size = sizeof(struct io_async_msghdr),
d3656344 1068 },
0463b6c5 1069 [IORING_OP_TIMEOUT] = {
5bd2182d 1070 .audit_skip = 1,
e8c2bc1f 1071 .async_size = sizeof(struct io_timeout_data),
d3656344 1072 },
9c8e11b3
PB
1073 [IORING_OP_TIMEOUT_REMOVE] = {
1074 /* used by timeout updates' prep() */
5bd2182d 1075 .audit_skip = 1,
9c8e11b3 1076 },
0463b6c5 1077 [IORING_OP_ACCEPT] = {
d3656344
JA
1078 .needs_file = 1,
1079 .unbound_nonreg_file = 1,
8a72758c 1080 .pollin = 1,
52dd8640 1081 .poll_exclusive = 1,
d3656344 1082 },
5bd2182d
PM
1083 [IORING_OP_ASYNC_CANCEL] = {
1084 .audit_skip = 1,
1085 },
0463b6c5 1086 [IORING_OP_LINK_TIMEOUT] = {
5bd2182d 1087 .audit_skip = 1,
e8c2bc1f 1088 .async_size = sizeof(struct io_timeout_data),
d3656344 1089 },
0463b6c5 1090 [IORING_OP_CONNECT] = {
d3656344
JA
1091 .needs_file = 1,
1092 .unbound_nonreg_file = 1,
8a72758c 1093 .pollout = 1,
26f0505a 1094 .needs_async_setup = 1,
e8c2bc1f 1095 .async_size = sizeof(struct io_async_connect),
d3656344 1096 },
0463b6c5 1097 [IORING_OP_FALLOCATE] = {
d3656344 1098 .needs_file = 1,
d3656344 1099 },
44526bed
JA
1100 [IORING_OP_OPENAT] = {},
1101 [IORING_OP_CLOSE] = {},
5bd2182d
PM
1102 [IORING_OP_FILES_UPDATE] = {
1103 .audit_skip = 1,
1104 },
1105 [IORING_OP_STATX] = {
1106 .audit_skip = 1,
1107 },
0463b6c5 1108 [IORING_OP_READ] = {
3a6820f2
JA
1109 .needs_file = 1,
1110 .unbound_nonreg_file = 1,
8a72758c 1111 .pollin = 1,
bcda7baa 1112 .buffer_select = 1,
27926b68 1113 .plug = 1,
5bd2182d 1114 .audit_skip = 1,
e8c2bc1f 1115 .async_size = sizeof(struct io_async_rw),
3a6820f2 1116 },
0463b6c5 1117 [IORING_OP_WRITE] = {
3a6820f2 1118 .needs_file = 1,
7b3188e7 1119 .hash_reg_file = 1,
3a6820f2 1120 .unbound_nonreg_file = 1,
8a72758c 1121 .pollout = 1,
27926b68 1122 .plug = 1,
5bd2182d 1123 .audit_skip = 1,
e8c2bc1f 1124 .async_size = sizeof(struct io_async_rw),
3a6820f2 1125 },
0463b6c5 1126 [IORING_OP_FADVISE] = {
4840e418 1127 .needs_file = 1,
5bd2182d 1128 .audit_skip = 1,
c1ca757b 1129 },
44526bed 1130 [IORING_OP_MADVISE] = {},
0463b6c5 1131 [IORING_OP_SEND] = {
fddaface
JA
1132 .needs_file = 1,
1133 .unbound_nonreg_file = 1,
8a72758c 1134 .pollout = 1,
5bd2182d 1135 .audit_skip = 1,
fddaface 1136 },
0463b6c5 1137 [IORING_OP_RECV] = {
fddaface
JA
1138 .needs_file = 1,
1139 .unbound_nonreg_file = 1,
8a72758c 1140 .pollin = 1,
bcda7baa 1141 .buffer_select = 1,
5bd2182d 1142 .audit_skip = 1,
fddaface 1143 },
0463b6c5 1144 [IORING_OP_OPENAT2] = {
cebdb986 1145 },
3e4827b0
JA
1146 [IORING_OP_EPOLL_CTL] = {
1147 .unbound_nonreg_file = 1,
5bd2182d 1148 .audit_skip = 1,
3e4827b0 1149 },
7d67af2c
PB
1150 [IORING_OP_SPLICE] = {
1151 .needs_file = 1,
1152 .hash_reg_file = 1,
1153 .unbound_nonreg_file = 1,
5bd2182d
PM
1154 .audit_skip = 1,
1155 },
1156 [IORING_OP_PROVIDE_BUFFERS] = {
1157 .audit_skip = 1,
1158 },
1159 [IORING_OP_REMOVE_BUFFERS] = {
1160 .audit_skip = 1,
ddf0322d 1161 },
f2a8d5c7
PB
1162 [IORING_OP_TEE] = {
1163 .needs_file = 1,
1164 .hash_reg_file = 1,
1165 .unbound_nonreg_file = 1,
5bd2182d 1166 .audit_skip = 1,
f2a8d5c7 1167 },
36f4fa68
JA
1168 [IORING_OP_SHUTDOWN] = {
1169 .needs_file = 1,
1170 },
44526bed
JA
1171 [IORING_OP_RENAMEAT] = {},
1172 [IORING_OP_UNLINKAT] = {},
e34a02dc 1173 [IORING_OP_MKDIRAT] = {},
7a8721f8 1174 [IORING_OP_SYMLINKAT] = {},
cf30da90 1175 [IORING_OP_LINKAT] = {},
4f57f06c
JA
1176 [IORING_OP_MSG_RING] = {
1177 .needs_file = 1,
1178 },
d3656344
JA
1179};
1180
0756a869
PB
1181/* requests with any of those set should undergo io_disarm_next() */
1182#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
da1a08c5 1183#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
0756a869 1184
7a612350 1185static bool io_disarm_next(struct io_kiocb *req);
eef51daa 1186static void io_uring_del_tctx_node(unsigned long index);
9936c7c2
PB
1187static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1188 struct task_struct *task,
3dd0c97a 1189 bool cancel_all);
78cc687b 1190static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1ffc5422 1191
4e118cd9 1192static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
c7dae4ba 1193static void io_dismantle_req(struct io_kiocb *req);
94ae5e77 1194static void io_queue_linked_timeout(struct io_kiocb *req);
fdecb662 1195static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 1196 struct io_uring_rsrc_update2 *up,
98f0b3b4 1197 unsigned nr_args);
68fb8979 1198static void io_clean_op(struct io_kiocb *req);
5106dd6e
JA
1199static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
1200 unsigned issue_flags);
1201static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
d5361233
JA
1202static void io_drop_inflight_file(struct io_kiocb *req);
1203static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
cbc2e203 1204static void io_queue_sqe(struct io_kiocb *req);
269bbe5f 1205static void io_rsrc_put_work(struct work_struct *work);
de0617e4 1206
907d1df3 1207static void io_req_task_queue(struct io_kiocb *req);
c450178d 1208static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
179ae0d1 1209static int io_req_prep_async(struct io_kiocb *req);
de0617e4 1210
b9445598
PB
1211static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1212 unsigned int issue_flags, u32 slot_index);
7df778be
PB
1213static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
1214
f1042b6c 1215static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
9aa8dfde 1216static void io_eventfd_signal(struct io_ring_ctx *ctx);
4e118cd9 1217static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
b9445598 1218
2b188cc1
JA
1219static struct kmem_cache *req_cachep;
1220
0918682b 1221static const struct file_operations io_uring_fops;
2b188cc1
JA
1222
1223struct sock *io_uring_get_socket(struct file *file)
1224{
1225#if defined(CONFIG_UNIX)
1226 if (file->f_op == &io_uring_fops) {
1227 struct io_ring_ctx *ctx = file->private_data;
1228
1229 return ctx->ring_sock->sk;
1230 }
1231#endif
1232 return NULL;
1233}
1234EXPORT_SYMBOL(io_uring_get_socket);
1235
1f59bc0f
PB
1236#if defined(CONFIG_UNIX)
1237static inline bool io_file_need_scm(struct file *filp)
1238{
1239 return !!unix_get_socket(filp);
1240}
1241#else
1242static inline bool io_file_need_scm(struct file *filp)
1243{
1244 return 0;
1245}
1246#endif
1247
f8929630
PB
1248static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
1249{
1250 lockdep_assert_held(&ctx->uring_lock);
1251 if (issue_flags & IO_URING_F_UNLOCKED)
1252 mutex_unlock(&ctx->uring_lock);
1253}
1254
1255static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
1256{
1257 /*
1258 * "Normal" inline submissions always hold the uring_lock, since we
1259 * grab it from the system call. Same is true for the SQPOLL offload.
1260 * The only exception is when we've detached the request and issue it
1261 * from an async worker thread, grab the lock for that case.
1262 */
1263 if (issue_flags & IO_URING_F_UNLOCKED)
1264 mutex_lock(&ctx->uring_lock);
1265 lockdep_assert_held(&ctx->uring_lock);
1266}
1267
f237c30a
PB
1268static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1269{
1270 if (!*locked) {
1271 mutex_lock(&ctx->uring_lock);
1272 *locked = true;
1273 }
1274}
1275
f2f87370
PB
1276#define io_for_each_link(pos, head) \
1277 for (pos = (head); pos; pos = pos->link)
1278
21c843d5
PB
1279/*
1280 * Shamelessly stolen from the mm implementation of page reference checking,
1281 * see commit f958d7b528b1 for details.
1282 */
1283#define req_ref_zero_or_close_to_overflow(req) \
1284 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
1285
1286static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1287{
20e60a38 1288 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1289 return atomic_inc_not_zero(&req->refs);
1290}
1291
21c843d5
PB
1292static inline bool req_ref_put_and_test(struct io_kiocb *req)
1293{
20e60a38
PB
1294 if (likely(!(req->flags & REQ_F_REFCOUNT)))
1295 return true;
1296
21c843d5
PB
1297 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1298 return atomic_dec_and_test(&req->refs);
1299}
1300
21c843d5
PB
1301static inline void req_ref_get(struct io_kiocb *req)
1302{
20e60a38 1303 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1304 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1305 atomic_inc(&req->refs);
1306}
1307
c450178d
PB
1308static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1309{
6f33b0bc 1310 if (!wq_list_empty(&ctx->submit_state.compl_reqs))
c450178d
PB
1311 __io_submit_flush_completions(ctx);
1312}
1313
48dcd38d 1314static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
20e60a38
PB
1315{
1316 if (!(req->flags & REQ_F_REFCOUNT)) {
1317 req->flags |= REQ_F_REFCOUNT;
48dcd38d 1318 atomic_set(&req->refs, nr);
20e60a38
PB
1319 }
1320}
1321
48dcd38d
PB
1322static inline void io_req_set_refcount(struct io_kiocb *req)
1323{
1324 __io_req_set_refcount(req, 1);
1325}
1326
ab409402
PB
1327#define IO_RSRC_REF_BATCH 100
1328
1329static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1330 struct io_ring_ctx *ctx)
1331 __must_hold(&ctx->uring_lock)
36f72fe2 1332{
ab409402
PB
1333 struct percpu_ref *ref = req->fixed_rsrc_refs;
1334
1335 if (ref) {
1336 if (ref == &ctx->rsrc_node->refs)
1337 ctx->rsrc_cached_refs++;
1338 else
1339 percpu_ref_put(ref);
1340 }
1341}
1342
1343static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
1344{
1345 if (req->fixed_rsrc_refs)
1346 percpu_ref_put(req->fixed_rsrc_refs);
1347}
1348
1349static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1350 __must_hold(&ctx->uring_lock)
1351{
1352 if (ctx->rsrc_cached_refs) {
1353 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
1354 ctx->rsrc_cached_refs = 0;
1355 }
1356}
1357
1358static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1359 __must_hold(&ctx->uring_lock)
1360{
1361 ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1362 percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1363}
36f72fe2 1364
a46be971 1365static inline void io_req_set_rsrc_node(struct io_kiocb *req,
5106dd6e
JA
1366 struct io_ring_ctx *ctx,
1367 unsigned int issue_flags)
36f72fe2 1368{
269bbe5f 1369 if (!req->fixed_rsrc_refs) {
a7f0ed5a 1370 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
5106dd6e
JA
1371
1372 if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1373 lockdep_assert_held(&ctx->uring_lock);
1374 ctx->rsrc_cached_refs--;
1375 if (unlikely(ctx->rsrc_cached_refs < 0))
1376 io_rsrc_refs_refill(ctx);
1377 } else {
1378 percpu_ref_get(req->fixed_rsrc_refs);
1379 }
36f72fe2
PB
1380 }
1381}
1382
cc3cec83 1383static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
3648e526 1384{
d1fd1c20 1385 struct io_buffer *kbuf = req->kbuf;
3648e526
HX
1386 unsigned int cflags;
1387
cc3cec83 1388 cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
3648e526 1389 req->flags &= ~REQ_F_BUFFER_SELECTED;
cc3cec83 1390 list_add(&kbuf->list, list);
d1fd1c20 1391 req->kbuf = NULL;
3648e526
HX
1392 return cflags;
1393}
1394
cc3cec83 1395static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
3648e526 1396{
8197b053
PB
1397 lockdep_assert_held(&req->ctx->completion_lock);
1398
3648e526
HX
1399 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1400 return 0;
cc3cec83
JA
1401 return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
1402}
1403
1404static inline unsigned int io_put_kbuf(struct io_kiocb *req,
1405 unsigned issue_flags)
1406{
1407 unsigned int cflags;
1408
1409 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1410 return 0;
1411
1412 /*
1413 * We can add this buffer back to two lists:
1414 *
1415 * 1) The io_buffers_cache list. This one is protected by the
1416 * ctx->uring_lock. If we already hold this lock, add back to this
1417 * list as we can grab it from issue as well.
1418 * 2) The io_buffers_comp list. This one is protected by the
1419 * ctx->completion_lock.
1420 *
1421 * We migrate buffers from the comp_list to the issue cache list
1422 * when we need one.
1423 */
1424 if (issue_flags & IO_URING_F_UNLOCKED) {
1425 struct io_ring_ctx *ctx = req->ctx;
1426
1427 spin_lock(&ctx->completion_lock);
1428 cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
1429 spin_unlock(&ctx->completion_lock);
1430 } else {
ab0ac095
PB
1431 lockdep_assert_held(&req->ctx->uring_lock);
1432
cc3cec83
JA
1433 cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
1434 }
1435
1436 return cflags;
3648e526
HX
1437}
1438
dbc7d452
JA
1439static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
1440 unsigned int bgid)
1441{
1442 struct list_head *hash_list;
1443 struct io_buffer_list *bl;
1444
1445 hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
1446 list_for_each_entry(bl, hash_list, list)
1447 if (bl->bgid == bgid || bgid == -1U)
1448 return bl;
1449
1450 return NULL;
1451}
1452
4d55f238 1453static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
b1c62645
JA
1454{
1455 struct io_ring_ctx *ctx = req->ctx;
dbc7d452
JA
1456 struct io_buffer_list *bl;
1457 struct io_buffer *buf;
b1c62645
JA
1458
1459 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1460 return;
8a3e8ee5
JA
1461 /* don't recycle if we already did IO to this buffer */
1462 if (req->flags & REQ_F_PARTIAL_IO)
1463 return;
b1c62645 1464
f8929630 1465 io_ring_submit_lock(ctx, issue_flags);
b1c62645
JA
1466
1467 buf = req->kbuf;
dbc7d452
JA
1468 bl = io_buffer_get_list(ctx, buf->bgid);
1469 list_add(&buf->list, &bl->buf_list);
b1c62645
JA
1470 req->flags &= ~REQ_F_BUFFER_SELECTED;
1471 req->kbuf = NULL;
4d55f238 1472
f8929630 1473 io_ring_submit_unlock(ctx, issue_flags);
b1c62645
JA
1474}
1475
3dd0c97a
PB
1476static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1477 bool cancel_all)
6af3f48b 1478 __must_hold(&req->ctx->timeout_lock)
08d23634 1479{
68207680 1480 if (task && head->task != task)
08d23634 1481 return false;
d5361233 1482 return cancel_all;
6af3f48b
PB
1483}
1484
1485/*
1486 * As io_match_task() but protected against racing with linked timeouts.
1487 * User must not hold timeout_lock.
1488 */
1489static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
1490 bool cancel_all)
1491{
6af3f48b
PB
1492 if (task && head->task != task)
1493 return false;
d5361233 1494 return cancel_all;
6af3f48b
PB
1495}
1496
d886e185
PB
1497static inline bool req_has_async_data(struct io_kiocb *req)
1498{
1499 return req->flags & REQ_F_ASYNC_DATA;
1500}
1501
93d2bcd2 1502static inline void req_set_fail(struct io_kiocb *req)
c40f6379 1503{
93d2bcd2 1504 req->flags |= REQ_F_FAIL;
04c76b41
PB
1505 if (req->flags & REQ_F_CQE_SKIP) {
1506 req->flags &= ~REQ_F_CQE_SKIP;
1507 req->flags |= REQ_F_SKIP_LINK_CQES;
1508 }
c40f6379 1509}
4a38aed2 1510
a8295b98
HX
1511static inline void req_fail_link_node(struct io_kiocb *req, int res)
1512{
1513 req_set_fail(req);
cef216fc 1514 req->cqe.res = res;
a8295b98
HX
1515}
1516
fa05457a
PB
1517static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
1518{
1519 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
1520}
1521
c072481d 1522static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
2b188cc1
JA
1523{
1524 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1525
0f158b4c 1526 complete(&ctx->ref_comp);
2b188cc1
JA
1527}
1528
8eb7e2d0
PB
1529static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1530{
1531 return !req->timeout.off;
1532}
1533
c072481d 1534static __cold void io_fallback_req_func(struct work_struct *work)
f56165e6
PB
1535{
1536 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1537 fallback_work.work);
1538 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1539 struct io_kiocb *req, *tmp;
f237c30a 1540 bool locked = false;
f56165e6
PB
1541
1542 percpu_ref_get(&ctx->refs);
1543 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
f237c30a 1544 req->io_task_work.func(req, &locked);
5636c00d 1545
f237c30a 1546 if (locked) {
c450178d 1547 io_submit_flush_completions(ctx);
f237c30a
PB
1548 mutex_unlock(&ctx->uring_lock);
1549 }
f56165e6
PB
1550 percpu_ref_put(&ctx->refs);
1551}
1552
c072481d 1553static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
2b188cc1
JA
1554{
1555 struct io_ring_ctx *ctx;
dbc7d452 1556 int i, hash_bits;
2b188cc1
JA
1557
1558 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1559 if (!ctx)
1560 return NULL;
1561
78076bb6
JA
1562 /*
1563 * Use 5 bits less than the max cq entries, that should give us around
1564 * 32 entries per hash list if totally full and uniformly spread.
1565 */
1566 hash_bits = ilog2(p->cq_entries);
1567 hash_bits -= 5;
1568 if (hash_bits <= 0)
1569 hash_bits = 1;
1570 ctx->cancel_hash_bits = hash_bits;
1571 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1572 GFP_KERNEL);
1573 if (!ctx->cancel_hash)
1574 goto err;
1575 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1576
6224843d
PB
1577 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1578 if (!ctx->dummy_ubuf)
1579 goto err;
1580 /* set invalid range, so io_import_fixed() fails meeting it */
1581 ctx->dummy_ubuf->ubuf = -1UL;
1582
dbc7d452
JA
1583 ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
1584 sizeof(struct list_head), GFP_KERNEL);
1585 if (!ctx->io_buffers)
1586 goto err;
1587 for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
1588 INIT_LIST_HEAD(&ctx->io_buffers[i]);
1589
21482896 1590 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
1591 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1592 goto err;
2b188cc1
JA
1593
1594 ctx->flags = p->flags;
90554200 1595 init_waitqueue_head(&ctx->sqo_sq_wait);
69fb2131 1596 INIT_LIST_HEAD(&ctx->sqd_list);
1d7bb1d5 1597 INIT_LIST_HEAD(&ctx->cq_overflow_list);
cc3cec83 1598 INIT_LIST_HEAD(&ctx->io_buffers_cache);
4d9237e3 1599 INIT_LIST_HEAD(&ctx->apoll_cache);
0f158b4c 1600 init_completion(&ctx->ref_comp);
61cf9370 1601 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
2b188cc1 1602 mutex_init(&ctx->uring_lock);
311997b3 1603 init_waitqueue_head(&ctx->cq_wait);
2b188cc1 1604 spin_lock_init(&ctx->completion_lock);
89850fce 1605 spin_lock_init(&ctx->timeout_lock);
5eef4e87 1606 INIT_WQ_LIST(&ctx->iopoll_list);
cc3cec83
JA
1607 INIT_LIST_HEAD(&ctx->io_buffers_pages);
1608 INIT_LIST_HEAD(&ctx->io_buffers_comp);
de0617e4 1609 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 1610 INIT_LIST_HEAD(&ctx->timeout_list);
ef9dd637 1611 INIT_LIST_HEAD(&ctx->ltimeout_list);
d67d2263
BM
1612 spin_lock_init(&ctx->rsrc_ref_lock);
1613 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
269bbe5f
BM
1614 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1615 init_llist_head(&ctx->rsrc_put_llist);
13bf43f5 1616 INIT_LIST_HEAD(&ctx->tctx_list);
c2b6c6bc
PB
1617 ctx->submit_state.free_list.next = NULL;
1618 INIT_WQ_LIST(&ctx->locked_free_list);
9011bf9a 1619 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
6f33b0bc 1620 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
2b188cc1 1621 return ctx;
206aefde 1622err:
6224843d 1623 kfree(ctx->dummy_ubuf);
78076bb6 1624 kfree(ctx->cancel_hash);
dbc7d452 1625 kfree(ctx->io_buffers);
206aefde
JA
1626 kfree(ctx);
1627 return NULL;
2b188cc1
JA
1628}
1629
8f6ed49a
PB
1630static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1631{
1632 struct io_rings *r = ctx->rings;
1633
1634 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1635 ctx->cq_extra--;
1636}
1637
9cf7c104 1638static bool req_need_defer(struct io_kiocb *req, u32 seq)
7adf4eaf 1639{
2bc9930e
JA
1640 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1641 struct io_ring_ctx *ctx = req->ctx;
a197f664 1642
8f6ed49a 1643 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
2bc9930e 1644 }
de0617e4 1645
9d858b21 1646 return false;
de0617e4
JA
1647}
1648
35645ac3
PB
1649#define FFS_NOWAIT 0x1UL
1650#define FFS_ISREG 0x2UL
1651#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
c97d8a0f
PB
1652
1653static inline bool io_req_ffs_set(struct io_kiocb *req)
1654{
35645ac3 1655 return req->flags & REQ_F_FIXED_FILE;
c97d8a0f
PB
1656}
1657
fd08e530
PB
1658static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1659{
906c6caa
PB
1660 if (WARN_ON_ONCE(!req->link))
1661 return NULL;
1662
4d13d1a4
PB
1663 req->flags &= ~REQ_F_ARM_LTIMEOUT;
1664 req->flags |= REQ_F_LINK_TIMEOUT;
fd08e530
PB
1665
1666 /* linked timeouts should have two refs once prep'ed */
48dcd38d 1667 io_req_set_refcount(req);
4d13d1a4
PB
1668 __io_req_set_refcount(req->link, 2);
1669 return req->link;
fd08e530
PB
1670}
1671
1672static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1673{
4d13d1a4 1674 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
fd08e530
PB
1675 return NULL;
1676 return __io_prep_linked_timeout(req);
1677}
1678
cb2d344c
PB
1679static noinline void __io_arm_ltimeout(struct io_kiocb *req)
1680{
1681 io_queue_linked_timeout(__io_prep_linked_timeout(req));
1682}
1683
1684static inline void io_arm_ltimeout(struct io_kiocb *req)
1685{
1686 if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
1687 __io_arm_ltimeout(req);
1688}
1689
1e6fa521
JA
1690static void io_prep_async_work(struct io_kiocb *req)
1691{
1692 const struct io_op_def *def = &io_op_defs[req->opcode];
1e6fa521
JA
1693 struct io_ring_ctx *ctx = req->ctx;
1694
b8e64b53
PB
1695 if (!(req->flags & REQ_F_CREDS)) {
1696 req->flags |= REQ_F_CREDS;
c10d1f98 1697 req->creds = get_current_cred();
b8e64b53 1698 }
003e8dcc 1699
e1d675df
PB
1700 req->work.list.next = NULL;
1701 req->work.flags = 0;
feaadc4f
PB
1702 if (req->flags & REQ_F_FORCE_ASYNC)
1703 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1704
1e6fa521
JA
1705 if (req->flags & REQ_F_ISREG) {
1706 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1707 io_wq_hash_work(&req->work, file_inode(req->file));
4b982bd0 1708 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1e6fa521
JA
1709 if (def->unbound_nonreg_file)
1710 req->work.flags |= IO_WQ_WORK_UNBOUND;
1711 }
561fb04a 1712}
cccf0ee8 1713
cbdcb435 1714static void io_prep_async_link(struct io_kiocb *req)
561fb04a 1715{
cbdcb435 1716 struct io_kiocb *cur;
54a91f3b 1717
44eff40a
PB
1718 if (req->flags & REQ_F_LINK_TIMEOUT) {
1719 struct io_ring_ctx *ctx = req->ctx;
1720
674ee8e1 1721 spin_lock_irq(&ctx->timeout_lock);
44eff40a
PB
1722 io_for_each_link(cur, req)
1723 io_prep_async_work(cur);
674ee8e1 1724 spin_unlock_irq(&ctx->timeout_lock);
44eff40a
PB
1725 } else {
1726 io_for_each_link(cur, req)
1727 io_prep_async_work(cur);
1728 }
561fb04a
JA
1729}
1730
fff4e40e
PB
1731static inline void io_req_add_compl_list(struct io_kiocb *req)
1732{
775a1f2f 1733 struct io_submit_state *state = &req->ctx->submit_state;
fff4e40e 1734
3d4aeb9f 1735 if (!(req->flags & REQ_F_CQE_SKIP))
775a1f2f 1736 state->flush_cqes = true;
fff4e40e
PB
1737 wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1738}
1739
77955efb 1740static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
561fb04a 1741{
cbdcb435 1742 struct io_kiocb *link = io_prep_linked_timeout(req);
5aa75ed5 1743 struct io_uring_task *tctx = req->task->io_uring;
561fb04a 1744
3bfe6106
JA
1745 BUG_ON(!tctx);
1746 BUG_ON(!tctx->io_wq);
561fb04a 1747
cbdcb435
PB
1748 /* init ->work of the whole link before punting */
1749 io_prep_async_link(req);
991468dc
JA
1750
1751 /*
1752 * Not expected to happen, but if we do have a bug where this _can_
1753 * happen, catch it here and ensure the request is marked as
1754 * canceled. That will make io-wq go through the usual work cancel
1755 * procedure rather than attempt to run this request (or create a new
1756 * worker for it).
1757 */
1758 if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1759 req->work.flags |= IO_WQ_WORK_CANCEL;
1760
971cf9c1
PB
1761 trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
1762 req->opcode, req->flags, &req->work,
1763 io_wq_is_hashed(&req->work));
ebf93667 1764 io_wq_enqueue(tctx->io_wq, &req->work);
7271ef3a
JA
1765 if (link)
1766 io_queue_linked_timeout(link);
cbdcb435
PB
1767}
1768
1ee4160c 1769static void io_kill_timeout(struct io_kiocb *req, int status)
8c855885 1770 __must_hold(&req->ctx->completion_lock)
89850fce 1771 __must_hold(&req->ctx->timeout_lock)
5262f567 1772{
e8c2bc1f 1773 struct io_timeout_data *io = req->async_data;
5262f567 1774
fd9c7bc5 1775 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2ae2eb9d
PB
1776 if (status)
1777 req_set_fail(req);
01cec8c1
PB
1778 atomic_set(&req->ctx->cq_timeouts,
1779 atomic_read(&req->ctx->cq_timeouts) + 1);
135fcde8 1780 list_del_init(&req->timeout.list);
4e118cd9 1781 io_req_tw_post_queue(req, status, 0);
5262f567
JA
1782 }
1783}
1784
c072481d 1785static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
de0617e4 1786{
441b8a78 1787 while (!list_empty(&ctx->defer_list)) {
27dc8338
PB
1788 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1789 struct io_defer_entry, list);
de0617e4 1790
9cf7c104 1791 if (req_need_defer(de->req, de->seq))
04518945 1792 break;
27dc8338 1793 list_del_init(&de->list);
907d1df3 1794 io_req_task_queue(de->req);
27dc8338 1795 kfree(de);
441b8a78 1796 }
04518945
PB
1797}
1798
c072481d 1799static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
89850fce 1800 __must_hold(&ctx->completion_lock)
de0617e4 1801{
441b8a78 1802 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
e677edbc 1803 struct io_kiocb *req, *tmp;
f010505b 1804
79ebeaee 1805 spin_lock_irq(&ctx->timeout_lock);
e677edbc 1806 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
f010505b 1807 u32 events_needed, events_got;
de0617e4 1808
8eb7e2d0 1809 if (io_is_timeout_noseq(req))
360428f8 1810 break;
f010505b
MDG
1811
1812 /*
1813 * Since seq can easily wrap around over time, subtract
1814 * the last seq at which timeouts were flushed before comparing.
1815 * Assuming not more than 2^31-1 events have happened since,
1816 * these subtractions won't have wrapped, so we can check if
1817 * target is in [last_seq, current_seq] by comparing the two.
1818 */
1819 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1820 events_got = seq - ctx->cq_last_tm_flush;
1821 if (events_got < events_needed)
360428f8 1822 break;
bfe68a22 1823
1ee4160c 1824 io_kill_timeout(req, 0);
f18ee4cf 1825 }
f010505b 1826 ctx->cq_last_tm_flush = seq;
79ebeaee 1827 spin_unlock_irq(&ctx->timeout_lock);
360428f8 1828}
5262f567 1829
9333f6b4
PB
1830static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1831{
1832 /* order cqe stores with ring update */
1833 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1834}
1835
9aa8dfde 1836static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
360428f8 1837{
9aa8dfde
PB
1838 if (ctx->off_timeout_used || ctx->drain_active) {
1839 spin_lock(&ctx->completion_lock);
1840 if (ctx->off_timeout_used)
1841 io_flush_timeouts(ctx);
1842 if (ctx->drain_active)
1843 io_queue_deferred(ctx);
1844 io_commit_cqring(ctx);
1845 spin_unlock(&ctx->completion_lock);
1846 }
1847 if (ctx->has_evfd)
1848 io_eventfd_signal(ctx);
de0617e4
JA
1849}
1850
90554200
JA
1851static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1852{
1853 struct io_rings *r = ctx->rings;
1854
a566c556 1855 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
90554200
JA
1856}
1857
888aae2e
PB
1858static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1859{
1860 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1861}
1862
d8da428b
PB
1863/*
1864 * writes to the cq entry need to come after reading head; the
1865 * control dependency is enough as we're using WRITE_ONCE to
1866 * fill the cq entry
1867 */
1868static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
2b188cc1 1869{
75b28aff 1870 struct io_rings *rings = ctx->rings;
d8da428b
PB
1871 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
1872 unsigned int free, queued, len;
1873
1874 /* userspace may cheat modifying the tail, be safe and do min */
1875 queued = min(__io_cqring_events(ctx), ctx->cq_entries);
1876 free = ctx->cq_entries - queued;
1877 /* we need a contiguous range, limit based on the current array offset */
1878 len = min(free, ctx->cq_entries - off);
1879 if (!len)
2b188cc1
JA
1880 return NULL;
1881
d8da428b
PB
1882 ctx->cached_cq_tail++;
1883 ctx->cqe_cached = &rings->cqes[off];
1884 ctx->cqe_sentinel = ctx->cqe_cached + len;
1885 return ctx->cqe_cached++;
1886}
1887
1888static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1889{
1890 if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
1891 ctx->cached_cq_tail++;
1892 return ctx->cqe_cached++;
1893 }
1894 return __io_get_cqe(ctx);
2b188cc1
JA
1895}
1896
77bc59b4 1897static void io_eventfd_signal(struct io_ring_ctx *ctx)
f2842ab5 1898{
77bc59b4
UA
1899 struct io_ev_fd *ev_fd;
1900
77bc59b4
UA
1901 rcu_read_lock();
1902 /*
1903 * rcu_dereference ctx->io_ev_fd once and use it for both for checking
1904 * and eventfd_signal
1905 */
1906 ev_fd = rcu_dereference(ctx->io_ev_fd);
1907
1908 /*
1909 * Check again if ev_fd exists incase an io_eventfd_unregister call
1910 * completed between the NULL check of ctx->io_ev_fd at the start of
1911 * the function and rcu_read_lock.
1912 */
1913 if (unlikely(!ev_fd))
1914 goto out;
7e55a19c 1915 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
77bc59b4
UA
1916 goto out;
1917
c75312dd 1918 if (!ev_fd->eventfd_async || io_wq_current_is_worker())
77bc59b4 1919 eventfd_signal(ev_fd->cq_ev_fd, 1);
77bc59b4
UA
1920out:
1921 rcu_read_unlock();
f2842ab5
JA
1922}
1923
9aa8dfde
PB
1924static inline void io_cqring_wake(struct io_ring_ctx *ctx)
1925{
1926 /*
1927 * wake_up_all() may seem excessive, but io_wake_function() and
1928 * io_should_wake() handle the termination of the loop and only
1929 * wake as many waiters as we need to.
1930 */
1931 if (wq_has_sleeper(&ctx->cq_wait))
1932 wake_up_all(&ctx->cq_wait);
1933}
1934
2c5d763c
JA
1935/*
1936 * This should only get called when at least one event has been posted.
1937 * Some applications rely on the eventfd notification count only changing
1938 * IFF a new CQE has been added to the CQ ring. There's no depedency on
1939 * 1:1 relationship between how many times this function is called (and
1940 * hence the eventfd count) and number of CQEs posted to the CQ ring.
1941 */
66fc25ca 1942static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 1943{
9aa8dfde
PB
1944 if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1945 ctx->has_evfd))
9333f6b4
PB
1946 __io_commit_cqring_flush(ctx);
1947
9aa8dfde 1948 io_cqring_wake(ctx);
1d7bb1d5
JA
1949}
1950
80c18e4a
PB
1951static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1952{
9aa8dfde
PB
1953 if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1954 ctx->has_evfd))
9333f6b4
PB
1955 __io_commit_cqring_flush(ctx);
1956
9aa8dfde
PB
1957 if (ctx->flags & IORING_SETUP_SQPOLL)
1958 io_cqring_wake(ctx);
80c18e4a
PB
1959}
1960
c4a2ed72 1961/* Returns true if there are no backlogged entries after the flush */
6c2450ae 1962static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5 1963{
b18032bb 1964 bool all_flushed, posted;
1d7bb1d5 1965
a566c556 1966 if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
e23de15f 1967 return false;
1d7bb1d5 1968
b18032bb 1969 posted = false;
79ebeaee 1970 spin_lock(&ctx->completion_lock);
6c2450ae 1971 while (!list_empty(&ctx->cq_overflow_list)) {
d068b506 1972 struct io_uring_cqe *cqe = io_get_cqe(ctx);
6c2450ae 1973 struct io_overflow_cqe *ocqe;
e6c8aa9a 1974
1d7bb1d5
JA
1975 if (!cqe && !force)
1976 break;
6c2450ae
PB
1977 ocqe = list_first_entry(&ctx->cq_overflow_list,
1978 struct io_overflow_cqe, list);
1979 if (cqe)
1980 memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1981 else
8f6ed49a
PB
1982 io_account_cq_overflow(ctx);
1983
b18032bb 1984 posted = true;
6c2450ae
PB
1985 list_del(&ocqe->list);
1986 kfree(ocqe);
1d7bb1d5
JA
1987 }
1988
09e88404
PB
1989 all_flushed = list_empty(&ctx->cq_overflow_list);
1990 if (all_flushed) {
5ed7a37d 1991 clear_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1992 WRITE_ONCE(ctx->rings->sq_flags,
1993 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
09e88404 1994 }
46930143 1995
60053be8 1996 io_commit_cqring(ctx);
79ebeaee 1997 spin_unlock(&ctx->completion_lock);
b18032bb
JA
1998 if (posted)
1999 io_cqring_ev_posted(ctx);
09e88404 2000 return all_flushed;
1d7bb1d5
JA
2001}
2002
90f67366 2003static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
6c503150 2004{
ca0a2651
JA
2005 bool ret = true;
2006
5ed7a37d 2007 if (test_bit(0, &ctx->check_cq_overflow)) {
6c503150
PB
2008 /* iopoll syncs against uring_lock, not completion_lock */
2009 if (ctx->flags & IORING_SETUP_IOPOLL)
2010 mutex_lock(&ctx->uring_lock);
90f67366 2011 ret = __io_cqring_overflow_flush(ctx, false);
6c503150
PB
2012 if (ctx->flags & IORING_SETUP_IOPOLL)
2013 mutex_unlock(&ctx->uring_lock);
2014 }
ca0a2651
JA
2015
2016 return ret;
6c503150
PB
2017}
2018
9d170164 2019static void __io_put_task(struct task_struct *task, int nr)
6a290a14
PB
2020{
2021 struct io_uring_task *tctx = task->io_uring;
2022
9d170164
PB
2023 percpu_counter_sub(&tctx->inflight, nr);
2024 if (unlikely(atomic_read(&tctx->in_idle)))
2025 wake_up(&tctx->wait);
2026 put_task_struct_many(task, nr);
2027}
2028
2029/* must to be called somewhat shortly after putting a request */
2030static inline void io_put_task(struct task_struct *task, int nr)
2031{
2032 if (likely(task == current))
2033 task->io_uring->cached_refs += nr;
2034 else
2035 __io_put_task(task, nr);
6a290a14
PB
2036}
2037
9a10867a
PB
2038static void io_task_refs_refill(struct io_uring_task *tctx)
2039{
2040 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
2041
2042 percpu_counter_add(&tctx->inflight, refill);
2043 refcount_add(refill, &current->usage);
2044 tctx->cached_refs += refill;
2045}
2046
2047static inline void io_get_task_refs(int nr)
2048{
2049 struct io_uring_task *tctx = current->io_uring;
2050
2051 tctx->cached_refs -= nr;
2052 if (unlikely(tctx->cached_refs < 0))
2053 io_task_refs_refill(tctx);
2054}
2055
3cc7fdb9
PB
2056static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
2057{
2058 struct io_uring_task *tctx = task->io_uring;
2059 unsigned int refs = tctx->cached_refs;
2060
2061 if (refs) {
2062 tctx->cached_refs = 0;
2063 percpu_counter_sub(&tctx->inflight, refs);
2064 put_task_struct_many(task, refs);
2065 }
2066}
2067
d4d19c19 2068static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 2069 s32 res, u32 cflags)
2b188cc1 2070{
cce4b8b0 2071 struct io_overflow_cqe *ocqe;
2b188cc1 2072
cce4b8b0
PB
2073 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
2074 if (!ocqe) {
2075 /*
2076 * If we're in ring overflow flush mode, or in task cancel mode,
2077 * or cannot allocate an overflow entry, then we need to drop it
2078 * on the floor.
2079 */
8f6ed49a 2080 io_account_cq_overflow(ctx);
cce4b8b0 2081 return false;
2b188cc1 2082 }
cce4b8b0 2083 if (list_empty(&ctx->cq_overflow_list)) {
5ed7a37d 2084 set_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
2085 WRITE_ONCE(ctx->rings->sq_flags,
2086 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
2087
cce4b8b0 2088 }
d4d19c19 2089 ocqe->cqe.user_data = user_data;
cce4b8b0
PB
2090 ocqe->cqe.res = res;
2091 ocqe->cqe.flags = cflags;
2092 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
2093 return true;
2b188cc1
JA
2094}
2095
ae4da189 2096static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
913a571a 2097 s32 res, u32 cflags)
2b188cc1
JA
2098{
2099 struct io_uring_cqe *cqe;
2100
2101 /*
2102 * If we can't get a cq entry, userspace overflowed the
2103 * submission (by quite a lot). Increment the overflow count in
2104 * the ring.
2105 */
d068b506 2106 cqe = io_get_cqe(ctx);
1d7bb1d5 2107 if (likely(cqe)) {
d4d19c19 2108 WRITE_ONCE(cqe->user_data, user_data);
2b188cc1 2109 WRITE_ONCE(cqe->res, res);
bcda7baa 2110 WRITE_ONCE(cqe->flags, cflags);
8d13326e 2111 return true;
2b188cc1 2112 }
d4d19c19 2113 return io_cqring_event_overflow(ctx, user_data, res, cflags);
2b188cc1
JA
2114}
2115
90e7c35f
PB
2116static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
2117 struct io_kiocb *req)
2118{
2119 struct io_uring_cqe *cqe;
2120
2121 trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
2122 req->cqe.res, req->cqe.flags);
2123
2124 /*
2125 * If we can't get a cq entry, userspace overflowed the
2126 * submission (by quite a lot). Increment the overflow count in
2127 * the ring.
2128 */
2129 cqe = io_get_cqe(ctx);
2130 if (likely(cqe)) {
2131 memcpy(cqe, &req->cqe, sizeof(*cqe));
2132 return true;
2133 }
2134 return io_cqring_event_overflow(ctx, req->cqe.user_data,
2135 req->cqe.res, req->cqe.flags);
2136}
2137
ae4da189 2138static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
d5ec1dfa 2139{
cef216fc
PB
2140 trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
2141 return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
d5ec1dfa
SR
2142}
2143
913a571a
PB
2144static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
2145 s32 res, u32 cflags)
bcda7baa 2146{
913a571a 2147 ctx->cq_extra++;
502c87d6 2148 trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
ae4da189 2149 return __io_fill_cqe(ctx, user_data, res, cflags);
bcda7baa
JA
2150}
2151
a37fae8a
HX
2152static void __io_req_complete_post(struct io_kiocb *req, s32 res,
2153 u32 cflags)
2b188cc1 2154{
78e19bbe 2155 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2156
04c76b41 2157 if (!(req->flags & REQ_F_CQE_SKIP))
ae4da189 2158 __io_fill_cqe_req(req, res, cflags);
c7dae4ba
JA
2159 /*
2160 * If we're the last reference to this request, add to our locked
2161 * free_list cache.
2162 */
de9b4cca 2163 if (req_ref_put_and_test(req)) {
da1a08c5 2164 if (req->flags & IO_REQ_LINK_FLAGS) {
0756a869 2165 if (req->flags & IO_DISARM_MASK)
7a612350
PB
2166 io_disarm_next(req);
2167 if (req->link) {
2168 io_req_task_queue(req->link);
2169 req->link = NULL;
2170 }
2171 }
ab409402 2172 io_req_put_rsrc(req, ctx);
8197b053
PB
2173 /*
2174 * Selected buffer deallocation in io_clean_op() assumes that
2175 * we don't hold ->completion_lock. Clean them here to avoid
2176 * deadlocks.
2177 */
2178 io_put_kbuf_comp(req);
c7dae4ba
JA
2179 io_dismantle_req(req);
2180 io_put_task(req->task, 1);
c2b6c6bc 2181 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
d0acdee2 2182 ctx->locked_free_nr++;
180f829f 2183 }
a37fae8a
HX
2184}
2185
2186static void io_req_complete_post(struct io_kiocb *req, s32 res,
2187 u32 cflags)
2188{
2189 struct io_ring_ctx *ctx = req->ctx;
2190
2191 spin_lock(&ctx->completion_lock);
2192 __io_req_complete_post(req, res, cflags);
7a612350 2193 io_commit_cqring(ctx);
79ebeaee 2194 spin_unlock(&ctx->completion_lock);
a3f34907 2195 io_cqring_ev_posted(ctx);
4e3d9ff9
JA
2196}
2197
54daa9b2
PB
2198static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
2199 u32 cflags)
229a7b63 2200{
cef216fc
PB
2201 req->cqe.res = res;
2202 req->cqe.flags = cflags;
e342c807 2203 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
2204}
2205
889fca73 2206static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
54daa9b2 2207 s32 res, u32 cflags)
bcda7baa 2208{
889fca73
PB
2209 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
2210 io_req_complete_state(req, res, cflags);
a38d68db 2211 else
c7dae4ba 2212 io_req_complete_post(req, res, cflags);
bcda7baa
JA
2213}
2214
54daa9b2 2215static inline void io_req_complete(struct io_kiocb *req, s32 res)
0ddf92e8 2216{
889fca73 2217 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
2218}
2219
54daa9b2 2220static void io_req_complete_failed(struct io_kiocb *req, s32 res)
f41db273 2221{
93d2bcd2 2222 req_set_fail(req);
ab0ac095 2223 io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
f41db273
PB
2224}
2225
864ea921
PB
2226/*
2227 * Don't initialise the fields below on every allocation, but do that in
2228 * advance and keep them valid across allocations.
2229 */
2230static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
2231{
2232 req->ctx = ctx;
2233 req->link = NULL;
2234 req->async_data = NULL;
2235 /* not necessary, but safer to zero */
cef216fc 2236 req->cqe.res = 0;
864ea921
PB
2237}
2238
dac7a098 2239static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
cd0ca2e0 2240 struct io_submit_state *state)
dac7a098 2241{
79ebeaee 2242 spin_lock(&ctx->completion_lock);
c2b6c6bc 2243 wq_list_splice(&ctx->locked_free_list, &state->free_list);
d0acdee2 2244 ctx->locked_free_nr = 0;
79ebeaee 2245 spin_unlock(&ctx->completion_lock);
dac7a098
PB
2246}
2247
88ab95be
PB
2248static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
2249{
2250 return !ctx->submit_state.free_list.next;
2251}
2252
5d5901a3
PB
2253/*
2254 * A request might get retired back into the request caches even before opcode
2255 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
2256 * Because of that, io_alloc_req() should be called only under ->uring_lock
2257 * and with extra caution to not get a request that is still worked on.
2258 */
c072481d 2259static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
5d5901a3 2260 __must_hold(&ctx->uring_lock)
2b188cc1 2261{
864ea921 2262 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
3ab665b7 2263 void *reqs[IO_REQ_ALLOC_BATCH];
864ea921 2264 int ret, i;
e5d1bc0a 2265
23a5c43b
PB
2266 /*
2267 * If we have more than a batch's worth of requests in our IRQ side
2268 * locked cache, grab the lock and move them over to our submission
2269 * side cache.
2270 */
2271 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) {
2272 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
88ab95be 2273 if (!io_req_cache_empty(ctx))
23a5c43b
PB
2274 return true;
2275 }
e5d1bc0a 2276
3ab665b7 2277 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
fd6fab2c 2278
864ea921
PB
2279 /*
2280 * Bulk alloc is all-or-nothing. If we fail to get a batch,
2281 * retry single alloc to be on the safe side.
2282 */
2283 if (unlikely(ret <= 0)) {
3ab665b7
PB
2284 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2285 if (!reqs[0])
a33ae9ce 2286 return false;
864ea921 2287 ret = 1;
2b188cc1 2288 }
864ea921 2289
37f0e767 2290 percpu_ref_get_many(&ctx->refs, ret);
3ab665b7 2291 for (i = 0; i < ret; i++) {
23a5c43b 2292 struct io_kiocb *req = reqs[i];
3ab665b7
PB
2293
2294 io_preinit_req(req, ctx);
fa05457a 2295 io_req_add_to_cache(req, ctx);
3ab665b7 2296 }
a33ae9ce
PB
2297 return true;
2298}
2299
2300static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
2301{
88ab95be 2302 if (unlikely(io_req_cache_empty(ctx)))
a33ae9ce
PB
2303 return __io_alloc_req_refill(ctx);
2304 return true;
2305}
2306
2307static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2308{
2309 struct io_wq_work_node *node;
2310
2311 node = wq_stack_extract(&ctx->submit_state.free_list);
c2b6c6bc 2312 return container_of(node, struct io_kiocb, comp_list);
2b188cc1
JA
2313}
2314
e1d767f0 2315static inline void io_put_file(struct file *file)
8da11c19 2316{
e1d767f0 2317 if (file)
8da11c19
PB
2318 fput(file);
2319}
2320
6b639522 2321static inline void io_dismantle_req(struct io_kiocb *req)
2b188cc1 2322{
094bae49 2323 unsigned int flags = req->flags;
929a3af9 2324
867f8fa5 2325 if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
3a0a6902 2326 io_clean_op(req);
e1d767f0
PB
2327 if (!(flags & REQ_F_FIXED_FILE))
2328 io_put_file(req->file);
e65ef56d
JA
2329}
2330
f5c6cf2a 2331static __cold void io_free_req(struct io_kiocb *req)
c6ca97b3 2332{
51a4cc11 2333 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 2334
ab409402 2335 io_req_put_rsrc(req, ctx);
216578e5 2336 io_dismantle_req(req);
7c660731 2337 io_put_task(req->task, 1);
c6ca97b3 2338
79ebeaee 2339 spin_lock(&ctx->completion_lock);
c2b6c6bc 2340 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
c34b025f 2341 ctx->locked_free_nr++;
79ebeaee 2342 spin_unlock(&ctx->completion_lock);
e65ef56d
JA
2343}
2344
f2f87370
PB
2345static inline void io_remove_next_linked(struct io_kiocb *req)
2346{
2347 struct io_kiocb *nxt = req->link;
2348
2349 req->link = nxt->link;
2350 nxt->link = NULL;
2351}
2352
33cc89a9
PB
2353static bool io_kill_linked_timeout(struct io_kiocb *req)
2354 __must_hold(&req->ctx->completion_lock)
89b263f6 2355 __must_hold(&req->ctx->timeout_lock)
2665abfd 2356{
33cc89a9 2357 struct io_kiocb *link = req->link;
f2f87370 2358
b97e736a 2359 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
c9abd7ad 2360 struct io_timeout_data *io = link->async_data;
7c86ffee 2361
f2f87370 2362 io_remove_next_linked(req);
90cd7e42 2363 link->timeout.head = NULL;
fd9c7bc5 2364 if (hrtimer_try_to_cancel(&io->timer) != -1) {
ef9dd637 2365 list_del(&link->timeout.list);
4e118cd9 2366 io_req_tw_post_queue(link, -ECANCELED, 0);
d4729fbd 2367 return true;
c9abd7ad
PB
2368 }
2369 }
d4729fbd 2370 return false;
7c86ffee
PB
2371}
2372
d148ca4b 2373static void io_fail_links(struct io_kiocb *req)
33cc89a9 2374 __must_hold(&req->ctx->completion_lock)
9e645e11 2375{
33cc89a9 2376 struct io_kiocb *nxt, *link = req->link;
04c76b41 2377 bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
9e645e11 2378
f2f87370 2379 req->link = NULL;
f2f87370 2380 while (link) {
a8295b98
HX
2381 long res = -ECANCELED;
2382
2383 if (link->flags & REQ_F_FAIL)
cef216fc 2384 res = link->cqe.res;
a8295b98 2385
f2f87370
PB
2386 nxt = link->link;
2387 link->link = NULL;
2665abfd 2388
cef216fc 2389 trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
502c87d6
SR
2390 req->opcode, link);
2391
4e118cd9
PB
2392 if (ignore_cqes)
2393 link->flags |= REQ_F_CQE_SKIP;
2394 else
04c76b41 2395 link->flags &= ~REQ_F_CQE_SKIP;
4e118cd9 2396 __io_req_complete_post(link, res, 0);
f2f87370 2397 link = nxt;
9e645e11 2398 }
33cc89a9 2399}
9e645e11 2400
33cc89a9
PB
2401static bool io_disarm_next(struct io_kiocb *req)
2402 __must_hold(&req->ctx->completion_lock)
2403{
2404 bool posted = false;
2405
0756a869
PB
2406 if (req->flags & REQ_F_ARM_LTIMEOUT) {
2407 struct io_kiocb *link = req->link;
2408
906c6caa 2409 req->flags &= ~REQ_F_ARM_LTIMEOUT;
0756a869
PB
2410 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2411 io_remove_next_linked(req);
4e118cd9 2412 io_req_tw_post_queue(link, -ECANCELED, 0);
0756a869
PB
2413 posted = true;
2414 }
2415 } else if (req->flags & REQ_F_LINK_TIMEOUT) {
89b263f6
JA
2416 struct io_ring_ctx *ctx = req->ctx;
2417
2418 spin_lock_irq(&ctx->timeout_lock);
33cc89a9 2419 posted = io_kill_linked_timeout(req);
89b263f6
JA
2420 spin_unlock_irq(&ctx->timeout_lock);
2421 }
93d2bcd2 2422 if (unlikely((req->flags & REQ_F_FAIL) &&
e4335ed3 2423 !(req->flags & REQ_F_HARDLINK))) {
33cc89a9
PB
2424 posted |= (req->link != NULL);
2425 io_fail_links(req);
2426 }
2427 return posted;
9e645e11
JA
2428}
2429
d81499bf
PB
2430static void __io_req_find_next_prep(struct io_kiocb *req)
2431{
2432 struct io_ring_ctx *ctx = req->ctx;
2433 bool posted;
2434
2435 spin_lock(&ctx->completion_lock);
2436 posted = io_disarm_next(req);
60053be8 2437 io_commit_cqring(ctx);
d81499bf
PB
2438 spin_unlock(&ctx->completion_lock);
2439 if (posted)
2440 io_cqring_ev_posted(ctx);
2441}
2442
2443static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
c69f8dbe 2444{
33cc89a9 2445 struct io_kiocb *nxt;
944e58bf 2446
9e645e11
JA
2447 /*
2448 * If LINK is set, we have dependent requests in this chain. If we
2449 * didn't fail this request, queue the first one up, moving any other
2450 * dependencies to the next request. In case of failure, fail the rest
2451 * of the chain.
2452 */
d81499bf
PB
2453 if (unlikely(req->flags & IO_DISARM_MASK))
2454 __io_req_find_next_prep(req);
33cc89a9
PB
2455 nxt = req->link;
2456 req->link = NULL;
2457 return nxt;
4d7dd462 2458}
9e645e11 2459
f237c30a 2460static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2c32395d
PB
2461{
2462 if (!ctx)
2463 return;
f237c30a 2464 if (*locked) {
c450178d 2465 io_submit_flush_completions(ctx);
2c32395d 2466 mutex_unlock(&ctx->uring_lock);
f237c30a 2467 *locked = false;
2c32395d
PB
2468 }
2469 percpu_ref_put(&ctx->refs);
2470}
2471
f28c240e
HX
2472static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
2473{
2474 io_commit_cqring(ctx);
2475 spin_unlock(&ctx->completion_lock);
2476 io_cqring_ev_posted(ctx);
2477}
2478
2479static void handle_prev_tw_list(struct io_wq_work_node *node,
2480 struct io_ring_ctx **ctx, bool *uring_locked)
2481{
2482 if (*ctx && !*uring_locked)
2483 spin_lock(&(*ctx)->completion_lock);
2484
2485 do {
2486 struct io_wq_work_node *next = node->next;
2487 struct io_kiocb *req = container_of(node, struct io_kiocb,
2488 io_task_work.node);
2489
34d2bfe7
JA
2490 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2491
f28c240e
HX
2492 if (req->ctx != *ctx) {
2493 if (unlikely(!*uring_locked && *ctx))
2494 ctx_commit_and_unlock(*ctx);
2495
2496 ctx_flush_and_put(*ctx, uring_locked);
2497 *ctx = req->ctx;
2498 /* if not contended, grab and improve batching */
2499 *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
2500 percpu_ref_get(&(*ctx)->refs);
2501 if (unlikely(!*uring_locked))
2502 spin_lock(&(*ctx)->completion_lock);
2503 }
2504 if (likely(*uring_locked))
2505 req->io_task_work.func(req, uring_locked);
2506 else
cef216fc 2507 __io_req_complete_post(req, req->cqe.res,
cc3cec83 2508 io_put_kbuf_comp(req));
f28c240e
HX
2509 node = next;
2510 } while (node);
2511
2512 if (unlikely(!*uring_locked))
2513 ctx_commit_and_unlock(*ctx);
2514}
2515
2516static void handle_tw_list(struct io_wq_work_node *node,
2517 struct io_ring_ctx **ctx, bool *locked)
9f8d032a
HX
2518{
2519 do {
2520 struct io_wq_work_node *next = node->next;
2521 struct io_kiocb *req = container_of(node, struct io_kiocb,
2522 io_task_work.node);
2523
34d2bfe7
JA
2524 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2525
9f8d032a
HX
2526 if (req->ctx != *ctx) {
2527 ctx_flush_and_put(*ctx, locked);
2528 *ctx = req->ctx;
2529 /* if not contended, grab and improve batching */
2530 *locked = mutex_trylock(&(*ctx)->uring_lock);
2531 percpu_ref_get(&(*ctx)->refs);
2532 }
2533 req->io_task_work.func(req, locked);
2534 node = next;
2535 } while (node);
2536}
2537
7cbf1722 2538static void tctx_task_work(struct callback_head *cb)
c40f6379 2539{
f28c240e 2540 bool uring_locked = false;
ebd0df2e 2541 struct io_ring_ctx *ctx = NULL;
3f18407d
PB
2542 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2543 task_work);
c40f6379 2544
16f72070 2545 while (1) {
f28c240e 2546 struct io_wq_work_node *node1, *node2;
3f18407d
PB
2547
2548 spin_lock_irq(&tctx->task_lock);
f28c240e
HX
2549 node1 = tctx->prior_task_list.first;
2550 node2 = tctx->task_list.first;
3f18407d 2551 INIT_WQ_LIST(&tctx->task_list);
f28c240e
HX
2552 INIT_WQ_LIST(&tctx->prior_task_list);
2553 if (!node2 && !node1)
6294f368 2554 tctx->task_running = false;
3f18407d 2555 spin_unlock_irq(&tctx->task_lock);
f28c240e 2556 if (!node2 && !node1)
6294f368 2557 break;
3f18407d 2558
f28c240e
HX
2559 if (node1)
2560 handle_prev_tw_list(node1, &ctx, &uring_locked);
f28c240e
HX
2561 if (node2)
2562 handle_tw_list(node2, &ctx, &uring_locked);
7cbf1722 2563 cond_resched();
68ca8fc0
PB
2564
2565 if (!tctx->task_list.first &&
2566 !tctx->prior_task_list.first && uring_locked)
2567 io_submit_flush_completions(ctx);
3f18407d 2568 }
ebd0df2e 2569
f28c240e 2570 ctx_flush_and_put(ctx, &uring_locked);
3cc7fdb9
PB
2571
2572 /* relaxed read is enough as only the task itself sets ->in_idle */
2573 if (unlikely(atomic_read(&tctx->in_idle)))
2574 io_uring_drop_tctx_refs(current);
7cbf1722
JA
2575}
2576
4813c377 2577static void io_req_task_work_add(struct io_kiocb *req, bool priority)
7cbf1722 2578{
c15b79de 2579 struct task_struct *tsk = req->task;
7cbf1722 2580 struct io_uring_task *tctx = tsk->io_uring;
c15b79de 2581 enum task_work_notify_mode notify;
e09ee510 2582 struct io_wq_work_node *node;
0b81e80c 2583 unsigned long flags;
6294f368 2584 bool running;
7cbf1722
JA
2585
2586 WARN_ON_ONCE(!tctx);
2587
d5361233
JA
2588 io_drop_inflight_file(req);
2589
0b81e80c 2590 spin_lock_irqsave(&tctx->task_lock, flags);
4813c377
HX
2591 if (priority)
2592 wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
2593 else
2594 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
6294f368
PB
2595 running = tctx->task_running;
2596 if (!running)
2597 tctx->task_running = true;
0b81e80c 2598 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2599
2600 /* task_work already pending, we're done */
6294f368 2601 if (running)
e09ee510 2602 return;
7cbf1722 2603
c15b79de
PB
2604 /*
2605 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2606 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2607 * processing task_work. There's no reliable way to tell if TWA_RESUME
2608 * will do the job.
2609 */
2610 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
d97ec623
PB
2611 if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2612 if (notify == TWA_NONE)
2613 wake_up_process(tsk);
e09ee510 2614 return;
c15b79de 2615 }
2215bed9 2616
0b81e80c 2617 spin_lock_irqsave(&tctx->task_lock, flags);
6294f368 2618 tctx->task_running = false;
4813c377 2619 node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
0b81e80c 2620 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722 2621
e09ee510
PB
2622 while (node) {
2623 req = container_of(node, struct io_kiocb, io_task_work.node);
2624 node = node->next;
2625 if (llist_add(&req->io_task_work.fallback_node,
2626 &req->ctx->fallback_llist))
2627 schedule_delayed_work(&req->ctx->fallback_work, 1);
2628 }
eab30c4d
PB
2629}
2630
4e118cd9
PB
2631static void io_req_tw_post(struct io_kiocb *req, bool *locked)
2632{
2633 io_req_complete_post(req, req->cqe.res, req->cqe.flags);
2634}
2635
2636static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
2637{
2638 req->cqe.res = res;
2639 req->cqe.flags = cflags;
2640 req->io_task_work.func = io_req_tw_post;
2641 io_req_task_work_add(req, false);
2642}
2643
f237c30a 2644static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
c40f6379 2645{
b18a1a45 2646 /* not needed for normal modes, but SQPOLL depends on it */
971cf9c1 2647 io_tw_lock(req->ctx, locked);
cef216fc 2648 io_req_complete_failed(req, req->cqe.res);
c40f6379
JA
2649}
2650
f237c30a 2651static void io_req_task_submit(struct io_kiocb *req, bool *locked)
c40f6379 2652{
971cf9c1 2653 io_tw_lock(req->ctx, locked);
316319e8 2654 /* req->task == current here, checking PF_EXITING is safe */
af066f31 2655 if (likely(!(req->task->flags & PF_EXITING)))
cbc2e203 2656 io_queue_sqe(req);
81b6d05c 2657 else
2593553a 2658 io_req_complete_failed(req, -EFAULT);
c40f6379
JA
2659}
2660
2c4b8eb6 2661static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
c40f6379 2662{
cef216fc 2663 req->cqe.res = ret;
5b0a6acc 2664 req->io_task_work.func = io_req_task_cancel;
4813c377 2665 io_req_task_work_add(req, false);
c40f6379
JA
2666}
2667
2c4b8eb6 2668static void io_req_task_queue(struct io_kiocb *req)
a3df7698 2669{
5b0a6acc 2670 req->io_task_work.func = io_req_task_submit;
4813c377 2671 io_req_task_work_add(req, false);
a3df7698
PB
2672}
2673
773af691
JA
2674static void io_req_task_queue_reissue(struct io_kiocb *req)
2675{
77955efb 2676 req->io_task_work.func = io_queue_iowq;
4813c377 2677 io_req_task_work_add(req, false);
773af691
JA
2678}
2679
57859f4d 2680static void io_queue_next(struct io_kiocb *req)
c69f8dbe 2681{
57859f4d 2682 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf 2683
57859f4d
PB
2684 if (nxt)
2685 io_req_task_queue(nxt);
c69f8dbe
JL
2686}
2687
3aa83bfb 2688static void io_free_batch_list(struct io_ring_ctx *ctx,
1cce17ac 2689 struct io_wq_work_node *node)
3aa83bfb 2690 __must_hold(&ctx->uring_lock)
5af1d13e 2691{
d4b7a5ef 2692 struct task_struct *task = NULL;
37f0e767 2693 int task_refs = 0;
5af1d13e 2694
3aa83bfb
PB
2695 do {
2696 struct io_kiocb *req = container_of(node, struct io_kiocb,
2697 comp_list);
2d6500d4 2698
a538be5b
PB
2699 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
2700 if (req->flags & REQ_F_REFCOUNT) {
2701 node = req->comp_list.next;
2702 if (!req_ref_put_and_test(req))
2703 continue;
2704 }
b605a7fa
PB
2705 if ((req->flags & REQ_F_POLLED) && req->apoll) {
2706 struct async_poll *apoll = req->apoll;
2707
2708 if (apoll->double_poll)
2709 kfree(apoll->double_poll);
2710 list_add(&apoll->poll.wait.entry,
2711 &ctx->apoll_cache);
2712 req->flags &= ~REQ_F_POLLED;
2713 }
da1a08c5 2714 if (req->flags & IO_REQ_LINK_FLAGS)
57859f4d 2715 io_queue_next(req);
a538be5b
PB
2716 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
2717 io_clean_op(req);
c1e53a69 2718 }
a538be5b
PB
2719 if (!(req->flags & REQ_F_FIXED_FILE))
2720 io_put_file(req->file);
2d6500d4 2721
ab409402 2722 io_req_put_rsrc_locked(req, ctx);
5af1d13e 2723
d4b7a5ef
PB
2724 if (req->task != task) {
2725 if (task)
2726 io_put_task(task, task_refs);
2727 task = req->task;
2728 task_refs = 0;
2729 }
2730 task_refs++;
c1e53a69 2731 node = req->comp_list.next;
fa05457a 2732 io_req_add_to_cache(req, ctx);
3aa83bfb 2733 } while (node);
d4b7a5ef 2734
d4b7a5ef
PB
2735 if (task)
2736 io_put_task(task, task_refs);
7a743e22
PB
2737}
2738
c450178d 2739static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
a141dd89 2740 __must_hold(&ctx->uring_lock)
905c172f 2741{
6f33b0bc 2742 struct io_wq_work_node *node, *prev;
cd0ca2e0 2743 struct io_submit_state *state = &ctx->submit_state;
905c172f 2744
3d4aeb9f
PB
2745 if (state->flush_cqes) {
2746 spin_lock(&ctx->completion_lock);
2747 wq_list_for_each(node, prev, &state->compl_reqs) {
2748 struct io_kiocb *req = container_of(node, struct io_kiocb,
6f33b0bc 2749 comp_list);
5182ed2e 2750
3d4aeb9f 2751 if (!(req->flags & REQ_F_CQE_SKIP))
90e7c35f 2752 __io_fill_cqe_req_filled(ctx, req);
3d4aeb9f
PB
2753 }
2754
2755 io_commit_cqring(ctx);
2756 spin_unlock(&ctx->completion_lock);
2757 io_cqring_ev_posted(ctx);
2758 state->flush_cqes = false;
905c172f 2759 }
5182ed2e 2760
1cce17ac 2761 io_free_batch_list(ctx, state->compl_reqs.first);
6f33b0bc 2762 INIT_WQ_LIST(&state->compl_reqs);
7a743e22
PB
2763}
2764
ba816ad6
JA
2765/*
2766 * Drop reference to request, return next in chain (if there is one) if this
2767 * was the last reference to this request.
2768 */
0d85035a 2769static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2770{
9b5f7bd9
PB
2771 struct io_kiocb *nxt = NULL;
2772
de9b4cca 2773 if (req_ref_put_and_test(req)) {
da1a08c5 2774 if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
7819a1f6 2775 nxt = io_req_find_next(req);
f5c6cf2a 2776 io_free_req(req);
2a44f467 2777 }
9b5f7bd9 2778 return nxt;
2b188cc1
JA
2779}
2780
0d85035a 2781static inline void io_put_req(struct io_kiocb *req)
e65ef56d 2782{
f5c6cf2a
PB
2783 if (req_ref_put_and_test(req)) {
2784 io_queue_next(req);
e65ef56d 2785 io_free_req(req);
f5c6cf2a 2786 }
2b188cc1
JA
2787}
2788
6c503150 2789static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2790{
2791 /* See comment at the top of this file */
2792 smp_rmb();
e23de15f 2793 return __io_cqring_events(ctx);
a3a0e43f
JA
2794}
2795
fb5ccc98
PB
2796static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2797{
2798 struct io_rings *rings = ctx->rings;
2799
2800 /* make sure SQ entry isn't read before tail */
2801 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2802}
2803
4c6e277c
JA
2804static inline bool io_run_task_work(void)
2805{
7f62d40d 2806 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
4c6e277c 2807 __set_current_state(TASK_RUNNING);
7c5d8fa6
EB
2808 clear_notify_signal();
2809 if (task_work_pending(current))
2810 task_work_run();
4c6e277c
JA
2811 return true;
2812 }
2813
2814 return false;
bcda7baa
JA
2815}
2816
5ba3c874 2817static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
def596e9 2818{
5eef4e87 2819 struct io_wq_work_node *pos, *start, *prev;
d729cf9a 2820 unsigned int poll_flags = BLK_POLL_NOSLEEP;
b688f11e 2821 DEFINE_IO_COMP_BATCH(iob);
5ba3c874 2822 int nr_events = 0;
def596e9
JA
2823
2824 /*
2825 * Only spin for completions if we don't have multiple devices hanging
87a115fb 2826 * off our complete list.
def596e9 2827 */
87a115fb 2828 if (ctx->poll_multi_queue || force_nonspin)
ef99b2d3 2829 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2830
5eef4e87
PB
2831 wq_list_for_each(pos, start, &ctx->iopoll_list) {
2832 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
9adbd45d 2833 struct kiocb *kiocb = &req->rw.kiocb;
a2416e1e 2834 int ret;
def596e9
JA
2835
2836 /*
581f9810
BM
2837 * Move completed and retryable entries to our local lists.
2838 * If we find a request that requires polling, break out
2839 * and complete those lists first, if we have entries there.
def596e9 2840 */
e3f721e6 2841 if (READ_ONCE(req->iopoll_completed))
def596e9
JA
2842 break;
2843
b688f11e 2844 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
a2416e1e
PB
2845 if (unlikely(ret < 0))
2846 return ret;
2847 else if (ret)
ef99b2d3 2848 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2849
3aadc23e 2850 /* iopoll may have completed current req */
b688f11e
JA
2851 if (!rq_list_empty(iob.req_list) ||
2852 READ_ONCE(req->iopoll_completed))
e3f721e6 2853 break;
def596e9
JA
2854 }
2855
b688f11e
JA
2856 if (!rq_list_empty(iob.req_list))
2857 iob.complete(&iob);
5eef4e87
PB
2858 else if (!pos)
2859 return 0;
def596e9 2860
5eef4e87
PB
2861 prev = start;
2862 wq_list_for_each_resume(pos, prev) {
2863 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2864
b3fa03fd
PB
2865 /* order with io_complete_rw_iopoll(), e.g. ->result updates */
2866 if (!smp_load_acquire(&req->iopoll_completed))
e3f721e6 2867 break;
c0713540 2868 nr_events++;
83a13a41
PB
2869 if (unlikely(req->flags & REQ_F_CQE_SKIP))
2870 continue;
cef216fc 2871 __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
e3f721e6 2872 }
def596e9 2873
f5ed3bcd
PB
2874 if (unlikely(!nr_events))
2875 return 0;
2876
2877 io_commit_cqring(ctx);
2878 io_cqring_ev_posted_iopoll(ctx);
1cce17ac 2879 pos = start ? start->next : ctx->iopoll_list.first;
5eef4e87 2880 wq_list_cut(&ctx->iopoll_list, prev, start);
1cce17ac 2881 io_free_batch_list(ctx, pos);
5ba3c874 2882 return nr_events;
def596e9
JA
2883}
2884
def596e9
JA
2885/*
2886 * We can't just wait for polled events to come to us, we have to actively
2887 * find and complete them.
2888 */
c072481d 2889static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2890{
2891 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2892 return;
2893
2894 mutex_lock(&ctx->uring_lock);
5eef4e87 2895 while (!wq_list_empty(&ctx->iopoll_list)) {
b2edc0a7 2896 /* let it sleep and repeat later if can't complete a request */
5ba3c874 2897 if (io_do_iopoll(ctx, true) == 0)
b2edc0a7 2898 break;
08f5439f
JA
2899 /*
2900 * Ensure we allow local-to-the-cpu processing to take place,
2901 * in this case we need to ensure that we reap all events.
3fcee5a6 2902 * Also let task_work, etc. to progress by releasing the mutex
08f5439f 2903 */
3fcee5a6
PB
2904 if (need_resched()) {
2905 mutex_unlock(&ctx->uring_lock);
2906 cond_resched();
2907 mutex_lock(&ctx->uring_lock);
2908 }
def596e9
JA
2909 }
2910 mutex_unlock(&ctx->uring_lock);
2911}
2912
7668b92a 2913static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2914{
7668b92a 2915 unsigned int nr_events = 0;
e9979b36 2916 int ret = 0;
500f9fba 2917
f39c8a5b
PB
2918 /*
2919 * Don't enter poll loop if we already have events pending.
2920 * If we do, we can potentially be spinning for commands that
2921 * already triggered a CQE (eg in error).
2922 */
5ed7a37d 2923 if (test_bit(0, &ctx->check_cq_overflow))
f39c8a5b
PB
2924 __io_cqring_overflow_flush(ctx, false);
2925 if (io_cqring_events(ctx))
d487b43c 2926 return 0;
def596e9 2927 do {
500f9fba
JA
2928 /*
2929 * If a submit got punted to a workqueue, we can have the
2930 * application entering polling for a command before it gets
2931 * issued. That app will hold the uring_lock for the duration
2932 * of the poll right here, so we need to take a breather every
2933 * now and then to ensure that the issue has a chance to add
2934 * the poll to the issued list. Otherwise we can spin here
2935 * forever, while the workqueue is stuck trying to acquire the
2936 * very same mutex.
2937 */
5eef4e87 2938 if (wq_list_empty(&ctx->iopoll_list)) {
8f487ef2
PB
2939 u32 tail = ctx->cached_cq_tail;
2940
500f9fba 2941 mutex_unlock(&ctx->uring_lock);
4c6e277c 2942 io_run_task_work();
500f9fba 2943 mutex_lock(&ctx->uring_lock);
def596e9 2944
8f487ef2
PB
2945 /* some requests don't go through iopoll_list */
2946 if (tail != ctx->cached_cq_tail ||
5eef4e87 2947 wq_list_empty(&ctx->iopoll_list))
e9979b36 2948 break;
500f9fba 2949 }
5ba3c874
PB
2950 ret = io_do_iopoll(ctx, !min);
2951 if (ret < 0)
2952 break;
2953 nr_events += ret;
2954 ret = 0;
2955 } while (nr_events < min && !need_resched());
d487b43c 2956
def596e9
JA
2957 return ret;
2958}
2959
491381ce 2960static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2961{
491381ce
JA
2962 /*
2963 * Tell lockdep we inherited freeze protection from submission
2964 * thread.
2965 */
2966 if (req->flags & REQ_F_ISREG) {
1c98679d 2967 struct super_block *sb = file_inode(req->file)->i_sb;
2b188cc1 2968
1c98679d
PB
2969 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2970 sb_end_write(sb);
2b188cc1
JA
2971 }
2972}
2973
b63534c4 2974#ifdef CONFIG_BLOCK
dc2a6e9a 2975static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4 2976{
ab454438 2977 struct io_async_rw *rw = req->async_data;
b63534c4 2978
d886e185 2979 if (!req_has_async_data(req))
ab454438 2980 return !io_req_prep_async(req);
538941e2 2981 iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
ab454438 2982 return true;
b63534c4 2983}
b63534c4 2984
3e6a0d3c 2985static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 2986{
355afaeb 2987 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 2988 struct io_ring_ctx *ctx = req->ctx;
b63534c4 2989
355afaeb
JA
2990 if (!S_ISBLK(mode) && !S_ISREG(mode))
2991 return false;
3e6a0d3c
JA
2992 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2993 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 2994 return false;
7c977a58
JA
2995 /*
2996 * If ref is dying, we might be running poll reap from the exit work.
2997 * Don't attempt to reissue from that path, just let it fail with
2998 * -EAGAIN.
2999 */
3e6a0d3c
JA
3000 if (percpu_ref_is_dying(&ctx->refs))
3001 return false;
ef046888
JA
3002 /*
3003 * Play it safe and assume not safe to re-import and reissue if we're
3004 * not in the original thread group (or in task context).
3005 */
3006 if (!same_thread_group(req->task, current) || !in_task())
3007 return false;
3e6a0d3c
JA
3008 return true;
3009}
e82ad485 3010#else
a1ff1e3f 3011static bool io_resubmit_prep(struct io_kiocb *req)
e82ad485
JA
3012{
3013 return false;
3014}
e82ad485 3015static bool io_rw_should_reissue(struct io_kiocb *req)
3e6a0d3c 3016{
b63534c4
JA
3017 return false;
3018}
3e6a0d3c 3019#endif
b63534c4 3020
8ef12efe 3021static bool __io_complete_rw_common(struct io_kiocb *req, long res)
a1d7c393 3022{
f63cf519 3023 if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
b65c128f 3024 kiocb_end_write(req);
f63cf519
JA
3025 fsnotify_modify(req->file);
3026 } else {
3027 fsnotify_access(req->file);
3028 }
cef216fc 3029 if (unlikely(res != req->cqe.res)) {
9532b99b
PB
3030 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
3031 io_rw_should_reissue(req)) {
3032 req->flags |= REQ_F_REISSUE;
8ef12efe 3033 return true;
9532b99b 3034 }
93d2bcd2 3035 req_set_fail(req);
cef216fc 3036 req->cqe.res = res;
9532b99b 3037 }
8ef12efe
JA
3038 return false;
3039}
3040
cc8e9ba7 3041static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
8ef12efe 3042{
cef216fc 3043 int res = req->cqe.res;
126180b9
PB
3044
3045 if (*locked) {
cc3cec83 3046 io_req_complete_state(req, res, io_put_kbuf(req, 0));
fff4e40e 3047 io_req_add_compl_list(req);
126180b9 3048 } else {
cc3cec83
JA
3049 io_req_complete_post(req, res,
3050 io_put_kbuf(req, IO_URING_F_UNLOCKED));
126180b9 3051 }
8ef12efe
JA
3052}
3053
00f6e68b 3054static void __io_complete_rw(struct io_kiocb *req, long res,
8ef12efe
JA
3055 unsigned int issue_flags)
3056{
3057 if (__io_complete_rw_common(req, res))
3058 return;
cef216fc 3059 __io_req_complete(req, issue_flags, req->cqe.res,
cc3cec83 3060 io_put_kbuf(req, issue_flags));
ba816ad6
JA
3061}
3062
6b19b766 3063static void io_complete_rw(struct kiocb *kiocb, long res)
ba816ad6 3064{
9adbd45d 3065 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 3066
8ef12efe
JA
3067 if (__io_complete_rw_common(req, res))
3068 return;
cef216fc 3069 req->cqe.res = res;
8ef12efe 3070 req->io_task_work.func = io_req_task_complete;
f28c240e 3071 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
2b188cc1
JA
3072}
3073
6b19b766 3074static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
def596e9 3075{
9adbd45d 3076 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 3077
491381ce
JA
3078 if (kiocb->ki_flags & IOCB_WRITE)
3079 kiocb_end_write(req);
cef216fc 3080 if (unlikely(res != req->cqe.res)) {
b66ceaf3
PB
3081 if (res == -EAGAIN && io_rw_should_reissue(req)) {
3082 req->flags |= REQ_F_REISSUE;
3083 return;
9532b99b 3084 }
cef216fc 3085 req->cqe.res = res;
8c130827 3086 }
bbde017a 3087
b3fa03fd
PB
3088 /* order with io_iopoll_complete() checking ->iopoll_completed */
3089 smp_store_release(&req->iopoll_completed, 1);
def596e9
JA
3090}
3091
3092/*
3093 * After the iocb has been issued, it's safe to be found on the poll list.
3094 * Adding the kiocb to the list AFTER submission ensures that we don't
f39c8a5b 3095 * find it from a io_do_iopoll() thread before the issuer is done
def596e9
JA
3096 * accessing the kiocb cookie.
3097 */
9882131c 3098static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
def596e9
JA
3099{
3100 struct io_ring_ctx *ctx = req->ctx;
3b44b371 3101 const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
cb3d8972
PB
3102
3103 /* workqueue context doesn't hold uring_lock, grab it now */
3b44b371 3104 if (unlikely(needs_lock))
cb3d8972 3105 mutex_lock(&ctx->uring_lock);
def596e9
JA
3106
3107 /*
3108 * Track whether we have multiple files in our lists. This will impact
3109 * how we do polling eventually, not spinning if we're on potentially
3110 * different devices.
3111 */
5eef4e87 3112 if (wq_list_empty(&ctx->iopoll_list)) {
915b3dde
HX
3113 ctx->poll_multi_queue = false;
3114 } else if (!ctx->poll_multi_queue) {
def596e9
JA
3115 struct io_kiocb *list_req;
3116
5eef4e87
PB
3117 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
3118 comp_list);
30da1b45 3119 if (list_req->file != req->file)
915b3dde 3120 ctx->poll_multi_queue = true;
def596e9
JA
3121 }
3122
3123 /*
3124 * For fast devices, IO may have already completed. If it has, add
3125 * it to the front so we find it first.
3126 */
65a6543d 3127 if (READ_ONCE(req->iopoll_completed))
5eef4e87 3128 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
def596e9 3129 else
5eef4e87 3130 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
bdcd3eab 3131
3b44b371 3132 if (unlikely(needs_lock)) {
cb3d8972
PB
3133 /*
3134 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
3135 * in sq thread task context or in io worker task context. If
3136 * current task context is sq thread, we don't need to check
3137 * whether should wake up sq thread.
3138 */
3139 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
3140 wq_has_sleeper(&ctx->sq_data->wait))
3141 wake_up(&ctx->sq_data->wait);
3142
3143 mutex_unlock(&ctx->uring_lock);
3144 }
def596e9
JA
3145}
3146
4503b767
JA
3147static bool io_bdev_nowait(struct block_device *bdev)
3148{
9ba0d0c8 3149 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
3150}
3151
2b188cc1
JA
3152/*
3153 * If we tracked the file through the SCM inflight mechanism, we could support
3154 * any file. For now, just ensure that anything potentially problematic is done
3155 * inline.
3156 */
88459b50 3157static bool __io_file_supports_nowait(struct file *file, umode_t mode)
2b188cc1 3158{
4503b767 3159 if (S_ISBLK(mode)) {
4e7b5671
CH
3160 if (IS_ENABLED(CONFIG_BLOCK) &&
3161 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
3162 return true;
3163 return false;
3164 }
976517f1 3165 if (S_ISSOCK(mode))
2b188cc1 3166 return true;
4503b767 3167 if (S_ISREG(mode)) {
4e7b5671
CH
3168 if (IS_ENABLED(CONFIG_BLOCK) &&
3169 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
3170 file->f_op != &io_uring_fops)
3171 return true;
3172 return false;
3173 }
2b188cc1 3174
c5b85625
JA
3175 /* any ->read/write should understand O_NONBLOCK */
3176 if (file->f_flags & O_NONBLOCK)
3177 return true;
35645ac3 3178 return file->f_mode & FMODE_NOWAIT;
2b188cc1 3179}
c5b85625 3180
88459b50
PB
3181/*
3182 * If we tracked the file through the SCM inflight mechanism, we could support
3183 * any file. For now, just ensure that anything potentially problematic is done
3184 * inline.
3185 */
3186static unsigned int io_file_get_flags(struct file *file)
3187{
3188 umode_t mode = file_inode(file)->i_mode;
3189 unsigned int res = 0;
af197f50 3190
88459b50
PB
3191 if (S_ISREG(mode))
3192 res |= FFS_ISREG;
3193 if (__io_file_supports_nowait(file, mode))
3194 res |= FFS_NOWAIT;
3195 return res;
2b188cc1
JA
3196}
3197
35645ac3 3198static inline bool io_file_supports_nowait(struct io_kiocb *req)
7b29f92d 3199{
88459b50 3200 return req->flags & REQ_F_SUPPORT_NOWAIT;
7b29f92d
JA
3201}
3202
b9a6b8f9 3203static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 3204{
9adbd45d 3205 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
3206 unsigned ioprio;
3207 int ret;
2b188cc1 3208
2b188cc1 3209 kiocb->ki_pos = READ_ONCE(sqe->off);
9adbd45d 3210
fb27274a
PB
3211 ioprio = READ_ONCE(sqe->ioprio);
3212 if (ioprio) {
3213 ret = ioprio_check_cap(ioprio);
3214 if (ret)
3215 return ret;
3216
3217 kiocb->ki_ioprio = ioprio;
3218 } else {
3219 kiocb->ki_ioprio = get_current_ioprio();
eae071c9
PB
3220 }
3221
578c0ee2 3222 req->imu = NULL;
3529d8c2
JA
3223 req->rw.addr = READ_ONCE(sqe->addr);
3224 req->rw.len = READ_ONCE(sqe->len);
584b0180 3225 req->rw.flags = READ_ONCE(sqe->rw_flags);
4f4eeba8 3226 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 3227 return 0;
2b188cc1
JA
3228}
3229
3230static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3231{
3232 switch (ret) {
3233 case -EIOCBQUEUED:
3234 break;
3235 case -ERESTARTSYS:
3236 case -ERESTARTNOINTR:
3237 case -ERESTARTNOHAND:
3238 case -ERESTART_RESTARTBLOCK:
3239 /*
3240 * We can't just restart the syscall, since previously
3241 * submitted sqes may already be in progress. Just fail this
3242 * IO with EINTR.
3243 */
3244 ret = -EINTR;
df561f66 3245 fallthrough;
2b188cc1 3246 default:
6b19b766 3247 kiocb->ki_complete(kiocb, ret);
2b188cc1
JA
3248 }
3249}
3250
b4aec400 3251static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
d34e1e5b
DY
3252{
3253 struct kiocb *kiocb = &req->rw.kiocb;
3254
6f83ab22
JA
3255 if (kiocb->ki_pos != -1)
3256 return &kiocb->ki_pos;
3257
3258 if (!(req->file->f_mode & FMODE_STREAM)) {
3259 req->flags |= REQ_F_CUR_POS;
3260 kiocb->ki_pos = req->file->f_pos;
3261 return &kiocb->ki_pos;
d34e1e5b 3262 }
6f83ab22
JA
3263
3264 kiocb->ki_pos = 0;
3265 return NULL;
d34e1e5b
DY
3266}
3267
2ea537ca 3268static void kiocb_done(struct io_kiocb *req, ssize_t ret,
889fca73 3269 unsigned int issue_flags)
ba816ad6 3270{
e8c2bc1f 3271 struct io_async_rw *io = req->async_data;
ba04291e 3272
227c0c96 3273 /* add previously done IO, if any */
d886e185 3274 if (req_has_async_data(req) && io->bytes_done > 0) {
227c0c96 3275 if (ret < 0)
e8c2bc1f 3276 ret = io->bytes_done;
227c0c96 3277 else
e8c2bc1f 3278 ret += io->bytes_done;
227c0c96
JA
3279 }
3280
ba04291e 3281 if (req->flags & REQ_F_CUR_POS)
2ea537ca
PB
3282 req->file->f_pos = req->rw.kiocb.ki_pos;
3283 if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
00f6e68b 3284 __io_complete_rw(req, ret, issue_flags);
ba816ad6 3285 else
2ea537ca 3286 io_rw_done(&req->rw.kiocb, ret);
97284637 3287
b66ceaf3 3288 if (req->flags & REQ_F_REISSUE) {
97284637 3289 req->flags &= ~REQ_F_REISSUE;
b91ef187 3290 if (io_resubmit_prep(req))
773af691 3291 io_req_task_queue_reissue(req);
b91ef187
PB
3292 else
3293 io_req_task_queue_fail(req, ret);
97284637 3294 }
ba816ad6
JA
3295}
3296
eae071c9
PB
3297static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3298 struct io_mapped_ubuf *imu)
edafccee 3299{
9adbd45d 3300 size_t len = req->rw.len;
75769e3f 3301 u64 buf_end, buf_addr = req->rw.addr;
edafccee 3302 size_t offset;
edafccee 3303
75769e3f 3304 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
edafccee
JA
3305 return -EFAULT;
3306 /* not inside the mapped region */
4751f53d 3307 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
edafccee
JA
3308 return -EFAULT;
3309
3310 /*
3311 * May not be a start of buffer, set size appropriately
3312 * and advance us to the beginning.
3313 */
3314 offset = buf_addr - imu->ubuf;
3315 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
3316
3317 if (offset) {
3318 /*
3319 * Don't use iov_iter_advance() here, as it's really slow for
3320 * using the latter parts of a big fixed buffer - it iterates
3321 * over each segment manually. We can cheat a bit here, because
3322 * we know that:
3323 *
3324 * 1) it's a BVEC iter, we set it up
3325 * 2) all bvecs are PAGE_SIZE in size, except potentially the
3326 * first and last bvec
3327 *
3328 * So just find our index, and adjust the iterator afterwards.
3329 * If the offset is within the first bvec (or the whole first
3330 * bvec, just use iov_iter_advance(). This makes it easier
3331 * since we can just skip the first segment, which may not
3332 * be PAGE_SIZE aligned.
3333 */
3334 const struct bio_vec *bvec = imu->bvec;
3335
3336 if (offset <= bvec->bv_len) {
3337 iov_iter_advance(iter, offset);
3338 } else {
3339 unsigned long seg_skip;
3340
3341 /* skip first vec */
3342 offset -= bvec->bv_len;
3343 seg_skip = 1 + (offset >> PAGE_SHIFT);
3344
3345 iter->bvec = bvec + seg_skip;
3346 iter->nr_segs -= seg_skip;
99c79f66 3347 iter->count -= bvec->bv_len + offset;
bd11b3a3 3348 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
3349 }
3350 }
3351
847595de 3352 return 0;
edafccee
JA
3353}
3354
5106dd6e
JA
3355static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3356 unsigned int issue_flags)
eae071c9 3357{
eae071c9
PB
3358 struct io_mapped_ubuf *imu = req->imu;
3359 u16 index, buf_index = req->buf_index;
3360
3361 if (likely(!imu)) {
578c0ee2
PB
3362 struct io_ring_ctx *ctx = req->ctx;
3363
eae071c9
PB
3364 if (unlikely(buf_index >= ctx->nr_user_bufs))
3365 return -EFAULT;
5106dd6e 3366 io_req_set_rsrc_node(req, ctx, issue_flags);
eae071c9
PB
3367 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3368 imu = READ_ONCE(ctx->user_bufs[index]);
3369 req->imu = imu;
3370 }
3371 return __io_import_fixed(req, rw, iter, imu);
3372}
3373
dbc7d452
JA
3374static void io_buffer_add_list(struct io_ring_ctx *ctx,
3375 struct io_buffer_list *bl, unsigned int bgid)
3376{
3377 struct list_head *list;
3378
3379 list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
3380 INIT_LIST_HEAD(&bl->buf_list);
3381 bl->bgid = bgid;
3382 list_add(&bl->list, list);
3383}
3384
bcda7baa 3385static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
51aac424 3386 int bgid, unsigned int issue_flags)
bcda7baa 3387{
30d51dd4 3388 struct io_buffer *kbuf = req->kbuf;
dbc7d452
JA
3389 struct io_ring_ctx *ctx = req->ctx;
3390 struct io_buffer_list *bl;
bcda7baa
JA
3391
3392 if (req->flags & REQ_F_BUFFER_SELECTED)
3393 return kbuf;
3394
f8929630 3395 io_ring_submit_lock(req->ctx, issue_flags);
bcda7baa 3396
dbc7d452
JA
3397 bl = io_buffer_get_list(ctx, bgid);
3398 if (bl && !list_empty(&bl->buf_list)) {
3399 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
3400 list_del(&kbuf->list);
bcda7baa
JA
3401 if (*len > kbuf->len)
3402 *len = kbuf->len;
30d51dd4
PB
3403 req->flags |= REQ_F_BUFFER_SELECTED;
3404 req->kbuf = kbuf;
bcda7baa
JA
3405 } else {
3406 kbuf = ERR_PTR(-ENOBUFS);
3407 }
3408
f8929630 3409 io_ring_submit_unlock(req->ctx, issue_flags);
bcda7baa
JA
3410 return kbuf;
3411}
3412
4d954c25 3413static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
51aac424 3414 unsigned int issue_flags)
4d954c25
JA
3415{
3416 struct io_buffer *kbuf;
4f4eeba8 3417 u16 bgid;
4d954c25 3418
4f4eeba8 3419 bgid = req->buf_index;
51aac424 3420 kbuf = io_buffer_select(req, len, bgid, issue_flags);
4d954c25
JA
3421 if (IS_ERR(kbuf))
3422 return kbuf;
4d954c25
JA
3423 return u64_to_user_ptr(kbuf->addr);
3424}
3425
3426#ifdef CONFIG_COMPAT
3427static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
51aac424 3428 unsigned int issue_flags)
4d954c25
JA
3429{
3430 struct compat_iovec __user *uiov;
3431 compat_ssize_t clen;
3432 void __user *buf;
3433 ssize_t len;
3434
3435 uiov = u64_to_user_ptr(req->rw.addr);
3436 if (!access_ok(uiov, sizeof(*uiov)))
3437 return -EFAULT;
3438 if (__get_user(clen, &uiov->iov_len))
3439 return -EFAULT;
3440 if (clen < 0)
3441 return -EINVAL;
3442
3443 len = clen;
51aac424 3444 buf = io_rw_buffer_select(req, &len, issue_flags);
4d954c25
JA
3445 if (IS_ERR(buf))
3446 return PTR_ERR(buf);
3447 iov[0].iov_base = buf;
3448 iov[0].iov_len = (compat_size_t) len;
3449 return 0;
3450}
3451#endif
3452
3453static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
51aac424 3454 unsigned int issue_flags)
4d954c25
JA
3455{
3456 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3457 void __user *buf;
3458 ssize_t len;
3459
3460 if (copy_from_user(iov, uiov, sizeof(*uiov)))
3461 return -EFAULT;
3462
3463 len = iov[0].iov_len;
3464 if (len < 0)
3465 return -EINVAL;
51aac424 3466 buf = io_rw_buffer_select(req, &len, issue_flags);
4d954c25
JA
3467 if (IS_ERR(buf))
3468 return PTR_ERR(buf);
3469 iov[0].iov_base = buf;
3470 iov[0].iov_len = len;
3471 return 0;
3472}
3473
3474static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
51aac424 3475 unsigned int issue_flags)
4d954c25 3476{
dddb3e26 3477 if (req->flags & REQ_F_BUFFER_SELECTED) {
30d51dd4 3478 struct io_buffer *kbuf = req->kbuf;
dddb3e26 3479
dddb3e26
JA
3480 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3481 iov[0].iov_len = kbuf->len;
4d954c25 3482 return 0;
dddb3e26 3483 }
dd201662 3484 if (req->rw.len != 1)
4d954c25
JA
3485 return -EINVAL;
3486
3487#ifdef CONFIG_COMPAT
3488 if (req->ctx->compat)
51aac424 3489 return io_compat_import(req, iov, issue_flags);
4d954c25
JA
3490#endif
3491
51aac424 3492 return __io_iov_buffer_select(req, iov, issue_flags);
4d954c25
JA
3493}
3494
caa8fe6e
PB
3495static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3496 struct io_rw_state *s,
3497 unsigned int issue_flags)
2b188cc1 3498{
5e49c973 3499 struct iov_iter *iter = &s->iter;
847595de 3500 u8 opcode = req->opcode;
caa8fe6e 3501 struct iovec *iovec;
d1d681b0
PB
3502 void __user *buf;
3503 size_t sqe_len;
4d954c25 3504 ssize_t ret;
edafccee 3505
f3251183 3506 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
5106dd6e 3507 ret = io_import_fixed(req, rw, iter, issue_flags);
f3251183
PB
3508 if (ret)
3509 return ERR_PTR(ret);
3510 return NULL;
3511 }
2b188cc1 3512
bcda7baa 3513 /* buffer index only valid with fixed read/write, or buffer select */
d1d681b0 3514 if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
caa8fe6e 3515 return ERR_PTR(-EINVAL);
9adbd45d 3516
d1d681b0
PB
3517 buf = u64_to_user_ptr(req->rw.addr);
3518 sqe_len = req->rw.len;
9adbd45d 3519
3a6820f2 3520 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 3521 if (req->flags & REQ_F_BUFFER_SELECT) {
51aac424 3522 buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
867a23ea 3523 if (IS_ERR(buf))
898df244 3524 return ERR_CAST(buf);
3f9d6441 3525 req->rw.len = sqe_len;
bcda7baa
JA
3526 }
3527
5e49c973 3528 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
f3251183
PB
3529 if (ret)
3530 return ERR_PTR(ret);
3531 return NULL;
3a6820f2
JA
3532 }
3533
caa8fe6e 3534 iovec = s->fast_iov;
4d954c25 3535 if (req->flags & REQ_F_BUFFER_SELECT) {
caa8fe6e 3536 ret = io_iov_buffer_select(req, iovec, issue_flags);
f3251183
PB
3537 if (ret)
3538 return ERR_PTR(ret);
3539 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3540 return NULL;
4d954c25
JA
3541 }
3542
caa8fe6e 3543 ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
89cd35c5 3544 req->ctx->compat);
caa8fe6e
PB
3545 if (unlikely(ret < 0))
3546 return ERR_PTR(ret);
3547 return iovec;
2b188cc1
JA
3548}
3549
5e49c973
PB
3550static inline int io_import_iovec(int rw, struct io_kiocb *req,
3551 struct iovec **iovec, struct io_rw_state *s,
3552 unsigned int issue_flags)
3553{
caa8fe6e
PB
3554 *iovec = __io_import_iovec(rw, req, s, issue_flags);
3555 if (unlikely(IS_ERR(*iovec)))
3556 return PTR_ERR(*iovec);
5e49c973 3557
5e49c973 3558 iov_iter_save_state(&s->iter, &s->iter_state);
caa8fe6e 3559 return 0;
2b188cc1
JA
3560}
3561
0fef9483
JA
3562static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3563{
5b09e37e 3564 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3565}
3566
31b51510 3567/*
32960613
JA
3568 * For files that don't have ->read_iter() and ->write_iter(), handle them
3569 * by looping over ->read() or ->write() manually.
31b51510 3570 */
4017eb91 3571static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3572{
4017eb91
JA
3573 struct kiocb *kiocb = &req->rw.kiocb;
3574 struct file *file = req->file;
32960613 3575 ssize_t ret = 0;
af9c45ec 3576 loff_t *ppos;
32960613
JA
3577
3578 /*
3579 * Don't support polled IO through this interface, and we can't
3580 * support non-blocking either. For the latter, this just causes
3581 * the kiocb to be handled from an async context.
3582 */
3583 if (kiocb->ki_flags & IOCB_HIPRI)
3584 return -EOPNOTSUPP;
35645ac3
PB
3585 if ((kiocb->ki_flags & IOCB_NOWAIT) &&
3586 !(kiocb->ki_filp->f_flags & O_NONBLOCK))
32960613
JA
3587 return -EAGAIN;
3588
af9c45ec
DY
3589 ppos = io_kiocb_ppos(kiocb);
3590
32960613 3591 while (iov_iter_count(iter)) {
311ae9e1 3592 struct iovec iovec;
32960613
JA
3593 ssize_t nr;
3594
311ae9e1
PB
3595 if (!iov_iter_is_bvec(iter)) {
3596 iovec = iov_iter_iovec(iter);
3597 } else {
4017eb91
JA
3598 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3599 iovec.iov_len = req->rw.len;
311ae9e1
PB
3600 }
3601
32960613
JA
3602 if (rw == READ) {
3603 nr = file->f_op->read(file, iovec.iov_base,
af9c45ec 3604 iovec.iov_len, ppos);
32960613
JA
3605 } else {
3606 nr = file->f_op->write(file, iovec.iov_base,
af9c45ec 3607 iovec.iov_len, ppos);
32960613
JA
3608 }
3609
3610 if (nr < 0) {
3611 if (!ret)
3612 ret = nr;
3613 break;
3614 }
5e929367 3615 ret += nr;
16c8d2df
JA
3616 if (!iov_iter_is_bvec(iter)) {
3617 iov_iter_advance(iter, nr);
3618 } else {
16c8d2df 3619 req->rw.addr += nr;
5e929367
JA
3620 req->rw.len -= nr;
3621 if (!req->rw.len)
3622 break;
16c8d2df 3623 }
32960613
JA
3624 if (nr != iovec.iov_len)
3625 break;
32960613
JA
3626 }
3627
3628 return ret;
3629}
3630
ff6165b2
JA
3631static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3632 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3633{
e8c2bc1f 3634 struct io_async_rw *rw = req->async_data;
b64e3444 3635
538941e2 3636 memcpy(&rw->s.iter, iter, sizeof(*iter));
afb87658 3637 rw->free_iovec = iovec;
227c0c96 3638 rw->bytes_done = 0;
ff6165b2 3639 /* can only be fixed buffers, no need to do anything */
9c3a205c 3640 if (iov_iter_is_bvec(iter))
ff6165b2 3641 return;
b64e3444 3642 if (!iovec) {
ff6165b2
JA
3643 unsigned iov_off = 0;
3644
538941e2 3645 rw->s.iter.iov = rw->s.fast_iov;
ff6165b2
JA
3646 if (iter->iov != fast_iov) {
3647 iov_off = iter->iov - fast_iov;
538941e2 3648 rw->s.iter.iov += iov_off;
ff6165b2 3649 }
538941e2
PB
3650 if (rw->s.fast_iov != fast_iov)
3651 memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
45097dae 3652 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3653 } else {
3654 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3655 }
3656}
3657
8d4af685 3658static inline bool io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3659{
e8c2bc1f
JA
3660 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3661 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
d886e185
PB
3662 if (req->async_data) {
3663 req->flags |= REQ_F_ASYNC_DATA;
3664 return false;
3665 }
3666 return true;
3d9932a8
XW
3667}
3668
ff6165b2 3669static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
c88598a9 3670 struct io_rw_state *s, bool force)
b7bb4f7d 3671{
26f0505a 3672 if (!force && !io_op_defs[req->opcode].needs_async_setup)
74566df3 3673 return 0;
d886e185 3674 if (!req_has_async_data(req)) {
cd658695
JA
3675 struct io_async_rw *iorw;
3676
6cb78689 3677 if (io_alloc_async_data(req)) {
6bf985dc 3678 kfree(iovec);
5d204bcf 3679 return -ENOMEM;
6bf985dc 3680 }
b7bb4f7d 3681
c88598a9 3682 io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
cd658695
JA
3683 iorw = req->async_data;
3684 /* we've copied and mapped the iter, ensure state is saved */
538941e2 3685 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
5d204bcf 3686 }
b7bb4f7d 3687 return 0;
f67676d1
JA
3688}
3689
73debe68 3690static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3691{
e8c2bc1f 3692 struct io_async_rw *iorw = req->async_data;
5e49c973 3693 struct iovec *iov;
847595de 3694 int ret;
c3e330a4 3695
51aac424 3696 /* submission path, ->uring_lock should already be taken */
3b44b371 3697 ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
c3e330a4
PB
3698 if (unlikely(ret < 0))
3699 return ret;
3700
ab0b196c
PB
3701 iorw->bytes_done = 0;
3702 iorw->free_iovec = iov;
3703 if (iov)
3704 req->flags |= REQ_F_NEED_CLEANUP;
c3e330a4
PB
3705 return 0;
3706}
3707
c1dd91d1 3708/*
ffdc8dab 3709 * This is our waitqueue callback handler, registered through __folio_lock_async()
c1dd91d1
JA
3710 * when we initially tried to do the IO with the iocb armed our waitqueue.
3711 * This gets called when the page is unlocked, and we generally expect that to
3712 * happen when the page IO is completed and the page is now uptodate. This will
3713 * queue a task_work based retry of the operation, attempting to copy the data
3714 * again. If the latter fails because the page was NOT uptodate, then we will
3715 * do a thread based blocking retry of the operation. That's the unexpected
3716 * slow path.
3717 */
bcf5a063
JA
3718static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3719 int sync, void *arg)
3720{
3721 struct wait_page_queue *wpq;
3722 struct io_kiocb *req = wait->private;
bcf5a063 3723 struct wait_page_key *key = arg;
bcf5a063
JA
3724
3725 wpq = container_of(wait, struct wait_page_queue, wait);
3726
cdc8fcb4
LT
3727 if (!wake_page_match(wpq, key))
3728 return 0;
3729
c8d317aa 3730 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063 3731 list_del_init(&wait->entry);
921b9054 3732 io_req_task_queue(req);
bcf5a063
JA
3733 return 1;
3734}
3735
c1dd91d1
JA
3736/*
3737 * This controls whether a given IO request should be armed for async page
3738 * based retry. If we return false here, the request is handed to the async
3739 * worker threads for retry. If we're doing buffered reads on a regular file,
3740 * we prepare a private wait_page_queue entry and retry the operation. This
3741 * will either succeed because the page is now uptodate and unlocked, or it
3742 * will register a callback when the page is unlocked at IO completion. Through
3743 * that callback, io_uring uses task_work to setup a retry of the operation.
3744 * That retry will attempt the buffered read again. The retry will generally
3745 * succeed, or in rare cases where it fails, we then fall back to using the
3746 * async worker threads for a blocking retry.
3747 */
227c0c96 3748static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3749{
e8c2bc1f
JA
3750 struct io_async_rw *rw = req->async_data;
3751 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3752 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3753
bcf5a063
JA
3754 /* never retry for NOWAIT, we just complete with -EAGAIN */
3755 if (req->flags & REQ_F_NOWAIT)
3756 return false;
f67676d1 3757
227c0c96 3758 /* Only for buffered IO */
3b2a4439 3759 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3760 return false;
3b2a4439 3761
bcf5a063
JA
3762 /*
3763 * just use poll if we can, and don't attempt if the fs doesn't
3764 * support callback based unlocks
3765 */
3766 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3767 return false;
f67676d1 3768
3b2a4439
JA
3769 wait->wait.func = io_async_buf_func;
3770 wait->wait.private = req;
3771 wait->wait.flags = 0;
3772 INIT_LIST_HEAD(&wait->wait.entry);
3773 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3774 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3775 kiocb->ki_waitq = wait;
3b2a4439 3776 return true;
bcf5a063
JA
3777}
3778
aeab9506 3779static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
bcf5a063 3780{
607b6fb8 3781 if (likely(req->file->f_op->read_iter))
bcf5a063 3782 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3783 else if (req->file->f_op->read)
4017eb91 3784 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3785 else
3786 return -EINVAL;
f67676d1
JA
3787}
3788
7db30437
ML
3789static bool need_read_all(struct io_kiocb *req)
3790{
3791 return req->flags & REQ_F_ISREG ||
3792 S_ISBLK(file_inode(req->file)->i_mode);
3793}
3794
584b0180
JA
3795static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
3796{
3797 struct kiocb *kiocb = &req->rw.kiocb;
3798 struct io_ring_ctx *ctx = req->ctx;
3799 struct file *file = req->file;
3800 int ret;
3801
3802 if (unlikely(!file || !(file->f_mode & mode)))
3803 return -EBADF;
3804
3805 if (!io_req_ffs_set(req))
3806 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
3807
3808 kiocb->ki_flags = iocb_flags(file);
3809 ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
3810 if (unlikely(ret))
3811 return ret;
3812
3813 /*
3814 * If the file is marked O_NONBLOCK, still allow retry for it if it
3815 * supports async. Otherwise it's impossible to use O_NONBLOCK files
3816 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
3817 */
3818 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
3819 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
3820 req->flags |= REQ_F_NOWAIT;
3821
3822 if (ctx->flags & IORING_SETUP_IOPOLL) {
3823 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
3824 return -EOPNOTSUPP;
3825
3826 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
3827 kiocb->ki_complete = io_complete_rw_iopoll;
3828 req->iopoll_completed = 0;
3829 } else {
3830 if (kiocb->ki_flags & IOCB_HIPRI)
3831 return -EINVAL;
3832 kiocb->ki_complete = io_complete_rw;
3833 }
3834
3835 return 0;
3836}
3837
889fca73 3838static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 3839{
607b6fb8 3840 struct io_rw_state __s, *s = &__s;
c88598a9 3841 struct iovec *iovec;
9adbd45d 3842 struct kiocb *kiocb = &req->rw.kiocb;
45d189c6 3843 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
d886e185 3844 struct io_async_rw *rw;
cd658695 3845 ssize_t ret, ret2;
b4aec400 3846 loff_t *ppos;
ff6165b2 3847
607b6fb8
PB
3848 if (!req_has_async_data(req)) {
3849 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3850 if (unlikely(ret < 0))
3851 return ret;
3852 } else {
2be2eb02
JA
3853 /*
3854 * Safe and required to re-import if we're using provided
3855 * buffers, as we dropped the selected one before retry.
3856 */
3857 if (req->flags & REQ_F_BUFFER_SELECT) {
3858 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3859 if (unlikely(ret < 0))
3860 return ret;
3861 }
3862
d886e185 3863 rw = req->async_data;
c88598a9 3864 s = &rw->s;
cd658695
JA
3865 /*
3866 * We come here from an earlier attempt, restore our state to
3867 * match in case it doesn't. It's cheap enough that we don't
3868 * need to make this conditional.
3869 */
c88598a9 3870 iov_iter_restore(&s->iter, &s->iter_state);
2846c481 3871 iovec = NULL;
2846c481 3872 }
584b0180 3873 ret = io_rw_init_file(req, FMODE_READ);
323b190b
JA
3874 if (unlikely(ret)) {
3875 kfree(iovec);
584b0180 3876 return ret;
323b190b 3877 }
cef216fc 3878 req->cqe.res = iov_iter_count(&s->iter);
2b188cc1 3879
607b6fb8
PB
3880 if (force_nonblock) {
3881 /* If the file doesn't support async, just async punt */
35645ac3 3882 if (unlikely(!io_file_supports_nowait(req))) {
607b6fb8
PB
3883 ret = io_setup_async_rw(req, iovec, s, true);
3884 return ret ?: -EAGAIN;
3885 }
a88fc400 3886 kiocb->ki_flags |= IOCB_NOWAIT;
607b6fb8
PB
3887 } else {
3888 /* Ensure we clear previously set non-block flag */
3889 kiocb->ki_flags &= ~IOCB_NOWAIT;
6713e7a6 3890 }
9e645e11 3891
b4aec400 3892 ppos = io_kiocb_update_pos(req);
d34e1e5b 3893
cef216fc 3894 ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
5ea5dd45
PB
3895 if (unlikely(ret)) {
3896 kfree(iovec);
3897 return ret;
3898 }
2b188cc1 3899
c88598a9 3900 ret = io_iter_do_read(req, &s->iter);
32960613 3901
230d50d4 3902 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
6ad7f233 3903 req->flags &= ~REQ_F_REISSUE;
9af177ee
JA
3904 /* if we can poll, just do that */
3905 if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
3906 return -EAGAIN;
eefdf30f
JA
3907 /* IOPOLL retry should happen for io-wq threads */
3908 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3909 goto done;
75c668cd
PB
3910 /* no retry on NONBLOCK nor RWF_NOWAIT */
3911 if (req->flags & REQ_F_NOWAIT)
355afaeb 3912 goto done;
f38c7e3a 3913 ret = 0;
230d50d4
JA
3914 } else if (ret == -EIOCBQUEUED) {
3915 goto out_free;
cef216fc 3916 } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
7db30437 3917 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
7335e3bf 3918 /* read all, failed, already did sync or don't want to retry */
00d23d51 3919 goto done;
227c0c96
JA
3920 }
3921
cd658695
JA
3922 /*
3923 * Don't depend on the iter state matching what was consumed, or being
3924 * untouched in case of error. Restore it and we'll advance it
3925 * manually if we need to.
3926 */
c88598a9 3927 iov_iter_restore(&s->iter, &s->iter_state);
cd658695 3928
c88598a9 3929 ret2 = io_setup_async_rw(req, iovec, s, true);
6bf985dc
PB
3930 if (ret2)
3931 return ret2;
3932
fe1cdd55 3933 iovec = NULL;
e8c2bc1f 3934 rw = req->async_data;
c88598a9 3935 s = &rw->s;
cd658695
JA
3936 /*
3937 * Now use our persistent iterator and state, if we aren't already.
3938 * We've restored and mapped the iter to match.
3939 */
227c0c96 3940
b23df91b 3941 do {
cd658695
JA
3942 /*
3943 * We end up here because of a partial read, either from
3944 * above or inside this loop. Advance the iter by the bytes
3945 * that were consumed.
3946 */
c88598a9
PB
3947 iov_iter_advance(&s->iter, ret);
3948 if (!iov_iter_count(&s->iter))
cd658695 3949 break;
b23df91b 3950 rw->bytes_done += ret;
c88598a9 3951 iov_iter_save_state(&s->iter, &s->iter_state);
cd658695 3952
b23df91b
PB
3953 /* if we can retry, do so with the callbacks armed */
3954 if (!io_rw_should_retry(req)) {
3955 kiocb->ki_flags &= ~IOCB_WAITQ;
3956 return -EAGAIN;
3957 }
3958
3959 /*
3960 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3961 * we get -EIOCBQUEUED, then we'll get a notification when the
3962 * desired page gets unlocked. We can also get a partial read
3963 * here, and if we do, then just retry at the new offset.
3964 */
c88598a9 3965 ret = io_iter_do_read(req, &s->iter);
b23df91b
PB
3966 if (ret == -EIOCBQUEUED)
3967 return 0;
227c0c96 3968 /* we got some bytes, but not all. retry. */
b5b0ecb7 3969 kiocb->ki_flags &= ~IOCB_WAITQ;
c88598a9 3970 iov_iter_restore(&s->iter, &s->iter_state);
cd658695 3971 } while (ret > 0);
227c0c96 3972done:
2ea537ca 3973 kiocb_done(req, ret, issue_flags);
fe1cdd55
PB
3974out_free:
3975 /* it's faster to check here then delegate to kfree */
3976 if (iovec)
3977 kfree(iovec);
5ea5dd45 3978 return 0;
2b188cc1
JA
3979}
3980
889fca73 3981static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 3982{
607b6fb8 3983 struct io_rw_state __s, *s = &__s;
c88598a9 3984 struct iovec *iovec;
9adbd45d 3985 struct kiocb *kiocb = &req->rw.kiocb;
45d189c6 3986 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 3987 ssize_t ret, ret2;
b4aec400 3988 loff_t *ppos;
2b188cc1 3989
607b6fb8 3990 if (!req_has_async_data(req)) {
5e49c973
PB
3991 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
3992 if (unlikely(ret < 0))
2846c481 3993 return ret;
607b6fb8
PB
3994 } else {
3995 struct io_async_rw *rw = req->async_data;
3996
3997 s = &rw->s;
3998 iov_iter_restore(&s->iter, &s->iter_state);
2846c481 3999 iovec = NULL;
2846c481 4000 }
584b0180 4001 ret = io_rw_init_file(req, FMODE_WRITE);
323b190b
JA
4002 if (unlikely(ret)) {
4003 kfree(iovec);
584b0180 4004 return ret;
323b190b 4005 }
cef216fc 4006 req->cqe.res = iov_iter_count(&s->iter);
2b188cc1 4007
607b6fb8
PB
4008 if (force_nonblock) {
4009 /* If the file doesn't support async, just async punt */
35645ac3 4010 if (unlikely(!io_file_supports_nowait(req)))
607b6fb8 4011 goto copy_iov;
fd6c2e4c 4012
607b6fb8
PB
4013 /* file path doesn't support NOWAIT for non-direct_IO */
4014 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
4015 (req->flags & REQ_F_ISREG))
4016 goto copy_iov;
31b51510 4017
607b6fb8
PB
4018 kiocb->ki_flags |= IOCB_NOWAIT;
4019 } else {
4020 /* Ensure we clear previously set non-block flag */
4021 kiocb->ki_flags &= ~IOCB_NOWAIT;
4022 }
31b51510 4023
b4aec400 4024 ppos = io_kiocb_update_pos(req);
d34e1e5b 4025
cef216fc 4026 ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
fa15bafb
PB
4027 if (unlikely(ret))
4028 goto out_free;
4ed734b0 4029
fa15bafb
PB
4030 /*
4031 * Open-code file_start_write here to grab freeze protection,
4032 * which will be released by another thread in
4033 * io_complete_rw(). Fool lockdep by telling it the lock got
4034 * released so that it doesn't complain about the held lock when
4035 * we return to userspace.
4036 */
4037 if (req->flags & REQ_F_ISREG) {
8a3c84b6 4038 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
4039 __sb_writers_release(file_inode(req->file)->i_sb,
4040 SB_FREEZE_WRITE);
4041 }
4042 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 4043
35645ac3 4044 if (likely(req->file->f_op->write_iter))
c88598a9 4045 ret2 = call_write_iter(req->file, kiocb, &s->iter);
2dd2111d 4046 else if (req->file->f_op->write)
c88598a9 4047 ret2 = loop_rw_iter(WRITE, req, &s->iter);
2dd2111d
GH
4048 else
4049 ret2 = -EINVAL;
4ed734b0 4050
6ad7f233
PB
4051 if (req->flags & REQ_F_REISSUE) {
4052 req->flags &= ~REQ_F_REISSUE;
230d50d4 4053 ret2 = -EAGAIN;
6ad7f233 4054 }
230d50d4 4055
fa15bafb
PB
4056 /*
4057 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
4058 * retry them without IOCB_NOWAIT.
4059 */
4060 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
4061 ret2 = -EAGAIN;
75c668cd
PB
4062 /* no retry on NONBLOCK nor RWF_NOWAIT */
4063 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 4064 goto done;
fa15bafb 4065 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f 4066 /* IOPOLL retry should happen for io-wq threads */
b10841c9 4067 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
eefdf30f 4068 goto copy_iov;
355afaeb 4069done:
2ea537ca 4070 kiocb_done(req, ret2, issue_flags);
fa15bafb 4071 } else {
f67676d1 4072copy_iov:
c88598a9
PB
4073 iov_iter_restore(&s->iter, &s->iter_state);
4074 ret = io_setup_async_rw(req, iovec, s, false);
6bf985dc 4075 return ret ?: -EAGAIN;
2b188cc1 4076 }
31b51510 4077out_free:
f261c168 4078 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 4079 if (iovec)
6f2cc166 4080 kfree(iovec);
2b188cc1
JA
4081 return ret;
4082}
4083
80a261fd
JA
4084static int io_renameat_prep(struct io_kiocb *req,
4085 const struct io_uring_sqe *sqe)
4086{
4087 struct io_rename *ren = &req->rename;
4088 const char __user *oldf, *newf;
4089
ed7eb259
JA
4090 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4091 return -EINVAL;
26578cda 4092 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
ed7eb259 4093 return -EINVAL;
80a261fd
JA
4094 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4095 return -EBADF;
4096
4097 ren->old_dfd = READ_ONCE(sqe->fd);
4098 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4099 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4100 ren->new_dfd = READ_ONCE(sqe->len);
4101 ren->flags = READ_ONCE(sqe->rename_flags);
4102
4103 ren->oldpath = getname(oldf);
4104 if (IS_ERR(ren->oldpath))
4105 return PTR_ERR(ren->oldpath);
4106
4107 ren->newpath = getname(newf);
4108 if (IS_ERR(ren->newpath)) {
4109 putname(ren->oldpath);
4110 return PTR_ERR(ren->newpath);
4111 }
4112
4113 req->flags |= REQ_F_NEED_CLEANUP;
4114 return 0;
4115}
4116
45d189c6 4117static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
4118{
4119 struct io_rename *ren = &req->rename;
4120 int ret;
4121
45d189c6 4122 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
4123 return -EAGAIN;
4124
4125 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
4126 ren->newpath, ren->flags);
4127
4128 req->flags &= ~REQ_F_NEED_CLEANUP;
4129 if (ret < 0)
93d2bcd2 4130 req_set_fail(req);
80a261fd
JA
4131 io_req_complete(req, ret);
4132 return 0;
4133}
4134
14a1143b
JA
4135static int io_unlinkat_prep(struct io_kiocb *req,
4136 const struct io_uring_sqe *sqe)
4137{
4138 struct io_unlink *un = &req->unlink;
4139 const char __user *fname;
4140
22634bc5
JA
4141 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4142 return -EINVAL;
26578cda
PB
4143 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4144 sqe->splice_fd_in)
22634bc5 4145 return -EINVAL;
14a1143b
JA
4146 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4147 return -EBADF;
4148
4149 un->dfd = READ_ONCE(sqe->fd);
4150
4151 un->flags = READ_ONCE(sqe->unlink_flags);
4152 if (un->flags & ~AT_REMOVEDIR)
4153 return -EINVAL;
4154
4155 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4156 un->filename = getname(fname);
4157 if (IS_ERR(un->filename))
4158 return PTR_ERR(un->filename);
4159
4160 req->flags |= REQ_F_NEED_CLEANUP;
4161 return 0;
4162}
4163
45d189c6 4164static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
4165{
4166 struct io_unlink *un = &req->unlink;
4167 int ret;
4168
45d189c6 4169 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
4170 return -EAGAIN;
4171
4172 if (un->flags & AT_REMOVEDIR)
4173 ret = do_rmdir(un->dfd, un->filename);
4174 else
4175 ret = do_unlinkat(un->dfd, un->filename);
4176
4177 req->flags &= ~REQ_F_NEED_CLEANUP;
4178 if (ret < 0)
93d2bcd2 4179 req_set_fail(req);
14a1143b
JA
4180 io_req_complete(req, ret);
4181 return 0;
4182}
4183
e34a02dc
DK
4184static int io_mkdirat_prep(struct io_kiocb *req,
4185 const struct io_uring_sqe *sqe)
4186{
4187 struct io_mkdir *mkd = &req->mkdir;
4188 const char __user *fname;
4189
4190 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4191 return -EINVAL;
4192 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
4193 sqe->splice_fd_in)
4194 return -EINVAL;
4195 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4196 return -EBADF;
4197
4198 mkd->dfd = READ_ONCE(sqe->fd);
4199 mkd->mode = READ_ONCE(sqe->len);
4200
4201 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4202 mkd->filename = getname(fname);
4203 if (IS_ERR(mkd->filename))
4204 return PTR_ERR(mkd->filename);
4205
4206 req->flags |= REQ_F_NEED_CLEANUP;
4207 return 0;
4208}
4209
04f34081 4210static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
e34a02dc
DK
4211{
4212 struct io_mkdir *mkd = &req->mkdir;
4213 int ret;
4214
4215 if (issue_flags & IO_URING_F_NONBLOCK)
4216 return -EAGAIN;
4217
4218 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
4219
4220 req->flags &= ~REQ_F_NEED_CLEANUP;
4221 if (ret < 0)
4222 req_set_fail(req);
4223 io_req_complete(req, ret);
4224 return 0;
4225}
4226
7a8721f8
DK
4227static int io_symlinkat_prep(struct io_kiocb *req,
4228 const struct io_uring_sqe *sqe)
4229{
4230 struct io_symlink *sl = &req->symlink;
4231 const char __user *oldpath, *newpath;
4232
4233 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4234 return -EINVAL;
4235 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
4236 sqe->splice_fd_in)
4237 return -EINVAL;
4238 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4239 return -EBADF;
4240
4241 sl->new_dfd = READ_ONCE(sqe->fd);
4242 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
4243 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4244
4245 sl->oldpath = getname(oldpath);
4246 if (IS_ERR(sl->oldpath))
4247 return PTR_ERR(sl->oldpath);
4248
4249 sl->newpath = getname(newpath);
4250 if (IS_ERR(sl->newpath)) {
4251 putname(sl->oldpath);
4252 return PTR_ERR(sl->newpath);
4253 }
4254
4255 req->flags |= REQ_F_NEED_CLEANUP;
4256 return 0;
4257}
4258
04f34081 4259static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
7a8721f8
DK
4260{
4261 struct io_symlink *sl = &req->symlink;
4262 int ret;
4263
4264 if (issue_flags & IO_URING_F_NONBLOCK)
4265 return -EAGAIN;
4266
4267 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
4268
4269 req->flags &= ~REQ_F_NEED_CLEANUP;
4270 if (ret < 0)
4271 req_set_fail(req);
4272 io_req_complete(req, ret);
4273 return 0;
4274}
4275
cf30da90
DK
4276static int io_linkat_prep(struct io_kiocb *req,
4277 const struct io_uring_sqe *sqe)
4278{
4279 struct io_hardlink *lnk = &req->hardlink;
4280 const char __user *oldf, *newf;
4281
4282 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4283 return -EINVAL;
4284 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
4285 return -EINVAL;
4286 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4287 return -EBADF;
4288
4289 lnk->old_dfd = READ_ONCE(sqe->fd);
4290 lnk->new_dfd = READ_ONCE(sqe->len);
4291 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4292 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4293 lnk->flags = READ_ONCE(sqe->hardlink_flags);
4294
4295 lnk->oldpath = getname(oldf);
4296 if (IS_ERR(lnk->oldpath))
4297 return PTR_ERR(lnk->oldpath);
4298
4299 lnk->newpath = getname(newf);
4300 if (IS_ERR(lnk->newpath)) {
4301 putname(lnk->oldpath);
4302 return PTR_ERR(lnk->newpath);
4303 }
4304
4305 req->flags |= REQ_F_NEED_CLEANUP;
4306 return 0;
4307}
4308
04f34081 4309static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
cf30da90
DK
4310{
4311 struct io_hardlink *lnk = &req->hardlink;
4312 int ret;
4313
4314 if (issue_flags & IO_URING_F_NONBLOCK)
4315 return -EAGAIN;
4316
4317 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
4318 lnk->newpath, lnk->flags);
4319
4320 req->flags &= ~REQ_F_NEED_CLEANUP;
4321 if (ret < 0)
4322 req_set_fail(req);
4323 io_req_complete(req, ret);
4324 return 0;
4325}
4326
36f4fa68
JA
4327static int io_shutdown_prep(struct io_kiocb *req,
4328 const struct io_uring_sqe *sqe)
4329{
4330#if defined(CONFIG_NET)
4331 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4332 return -EINVAL;
26578cda
PB
4333 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
4334 sqe->buf_index || sqe->splice_fd_in))
36f4fa68
JA
4335 return -EINVAL;
4336
4337 req->shutdown.how = READ_ONCE(sqe->len);
4338 return 0;
4339#else
4340 return -EOPNOTSUPP;
4341#endif
4342}
4343
45d189c6 4344static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
4345{
4346#if defined(CONFIG_NET)
4347 struct socket *sock;
4348 int ret;
4349
45d189c6 4350 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
4351 return -EAGAIN;
4352
48aba79b 4353 sock = sock_from_file(req->file);
36f4fa68 4354 if (unlikely(!sock))
48aba79b 4355 return -ENOTSOCK;
36f4fa68
JA
4356
4357 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d 4358 if (ret < 0)
93d2bcd2 4359 req_set_fail(req);
36f4fa68
JA
4360 io_req_complete(req, ret);
4361 return 0;
4362#else
4363 return -EOPNOTSUPP;
4364#endif
4365}
4366
f2a8d5c7
PB
4367static int __io_splice_prep(struct io_kiocb *req,
4368 const struct io_uring_sqe *sqe)
7d67af2c 4369{
fe7e3257 4370 struct io_splice *sp = &req->splice;
7d67af2c 4371 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 4372
3232dd02
PB
4373 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4374 return -EINVAL;
7d67af2c 4375
7d67af2c
PB
4376 sp->len = READ_ONCE(sqe->len);
4377 sp->flags = READ_ONCE(sqe->splice_flags);
7d67af2c
PB
4378 if (unlikely(sp->flags & ~valid_flags))
4379 return -EINVAL;
a3e4bc23 4380 sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
7d67af2c
PB
4381 return 0;
4382}
4383
f2a8d5c7
PB
4384static int io_tee_prep(struct io_kiocb *req,
4385 const struct io_uring_sqe *sqe)
4386{
4387 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
4388 return -EINVAL;
4389 return __io_splice_prep(req, sqe);
4390}
4391
45d189c6 4392static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
4393{
4394 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
4395 struct file *out = sp->file_out;
4396 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
a3e4bc23 4397 struct file *in;
f2a8d5c7
PB
4398 long ret = 0;
4399
45d189c6 4400 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7 4401 return -EAGAIN;
a3e4bc23 4402
5106dd6e 4403 if (sp->flags & SPLICE_F_FD_IN_FIXED)
e9419766 4404 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
5106dd6e
JA
4405 else
4406 in = io_file_get_normal(req, sp->splice_fd_in);
a3e4bc23
JA
4407 if (!in) {
4408 ret = -EBADF;
4409 goto done;
4410 }
4411
f2a8d5c7
PB
4412 if (sp->len)
4413 ret = do_tee(in, out, sp->len, flags);
4414
e1d767f0
PB
4415 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4416 io_put_file(in);
a3e4bc23 4417done:
f2a8d5c7 4418 if (ret != sp->len)
93d2bcd2 4419 req_set_fail(req);
e1e16097 4420 io_req_complete(req, ret);
f2a8d5c7
PB
4421 return 0;
4422}
4423
4424static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4425{
fe7e3257 4426 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
4427
4428 sp->off_in = READ_ONCE(sqe->splice_off_in);
4429 sp->off_out = READ_ONCE(sqe->off);
4430 return __io_splice_prep(req, sqe);
4431}
4432
45d189c6 4433static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
4434{
4435 struct io_splice *sp = &req->splice;
7d67af2c
PB
4436 struct file *out = sp->file_out;
4437 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4438 loff_t *poff_in, *poff_out;
a3e4bc23 4439 struct file *in;
c9687426 4440 long ret = 0;
7d67af2c 4441
45d189c6 4442 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 4443 return -EAGAIN;
7d67af2c 4444
5106dd6e 4445 if (sp->flags & SPLICE_F_FD_IN_FIXED)
e9419766 4446 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
5106dd6e
JA
4447 else
4448 in = io_file_get_normal(req, sp->splice_fd_in);
a3e4bc23
JA
4449 if (!in) {
4450 ret = -EBADF;
4451 goto done;
4452 }
4453
7d67af2c
PB
4454 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4455 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 4456
948a7749 4457 if (sp->len)
c9687426 4458 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c 4459
e1d767f0
PB
4460 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4461 io_put_file(in);
a3e4bc23 4462done:
7d67af2c 4463 if (ret != sp->len)
93d2bcd2 4464 req_set_fail(req);
e1e16097 4465 io_req_complete(req, ret);
7d67af2c
PB
4466 return 0;
4467}
4468
2b188cc1
JA
4469/*
4470 * IORING_OP_NOP just posts a completion event, nothing else.
4471 */
889fca73 4472static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
4473{
4474 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 4475
def596e9
JA
4476 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4477 return -EINVAL;
4478
889fca73 4479 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
4480 return 0;
4481}
4482
4f57f06c
JA
4483static int io_msg_ring_prep(struct io_kiocb *req,
4484 const struct io_uring_sqe *sqe)
4485{
f3b6a41e
JA
4486 if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
4487 sqe->splice_fd_in || sqe->buf_index || sqe->personality))
4f57f06c
JA
4488 return -EINVAL;
4489
4f57f06c
JA
4490 req->msg.user_data = READ_ONCE(sqe->off);
4491 req->msg.len = READ_ONCE(sqe->len);
4492 return 0;
4493}
4494
4495static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
4496{
4497 struct io_ring_ctx *target_ctx;
4498 struct io_msg *msg = &req->msg;
4f57f06c 4499 bool filled;
3f1d52ab 4500 int ret;
4f57f06c 4501
3f1d52ab
JA
4502 ret = -EBADFD;
4503 if (req->file->f_op != &io_uring_fops)
4504 goto done;
4f57f06c 4505
3f1d52ab 4506 ret = -EOVERFLOW;
4f57f06c
JA
4507 target_ctx = req->file->private_data;
4508
4509 spin_lock(&target_ctx->completion_lock);
7ef66d18 4510 filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
4f57f06c
JA
4511 io_commit_cqring(target_ctx);
4512 spin_unlock(&target_ctx->completion_lock);
4513
4514 if (filled) {
4515 io_cqring_ev_posted(target_ctx);
4516 ret = 0;
4517 }
4518
3f1d52ab 4519done:
9666d420
JA
4520 if (ret < 0)
4521 req_set_fail(req);
4f57f06c
JA
4522 __io_req_complete(req, issue_flags, ret, 0);
4523 return 0;
4524}
4525
1155c76a 4526static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 4527{
6b06314c 4528 struct io_ring_ctx *ctx = req->ctx;
c992fe29 4529
6b06314c 4530 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 4531 return -EINVAL;
26578cda
PB
4532 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4533 sqe->splice_fd_in))
c992fe29
CH
4534 return -EINVAL;
4535
8ed8d3c3
JA
4536 req->sync.flags = READ_ONCE(sqe->fsync_flags);
4537 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4538 return -EINVAL;
4539
4540 req->sync.off = READ_ONCE(sqe->off);
4541 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
4542 return 0;
4543}
4544
45d189c6 4545static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4546{
8ed8d3c3 4547 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
4548 int ret;
4549
ac45abc0 4550 /* fsync always requires a blocking context */
45d189c6 4551 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4552 return -EAGAIN;
4553
9adbd45d 4554 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
4555 end > 0 ? end : LLONG_MAX,
4556 req->sync.flags & IORING_FSYNC_DATASYNC);
4557 if (ret < 0)
93d2bcd2 4558 req_set_fail(req);
e1e16097 4559 io_req_complete(req, ret);
c992fe29
CH
4560 return 0;
4561}
4562
d63d1b5e
JA
4563static int io_fallocate_prep(struct io_kiocb *req,
4564 const struct io_uring_sqe *sqe)
4565{
26578cda
PB
4566 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4567 sqe->splice_fd_in)
d63d1b5e 4568 return -EINVAL;
3232dd02
PB
4569 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4570 return -EINVAL;
d63d1b5e
JA
4571
4572 req->sync.off = READ_ONCE(sqe->off);
4573 req->sync.len = READ_ONCE(sqe->addr);
4574 req->sync.mode = READ_ONCE(sqe->len);
4575 return 0;
4576}
4577
45d189c6 4578static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 4579{
ac45abc0
PB
4580 int ret;
4581
d63d1b5e 4582 /* fallocate always requiring blocking context */
45d189c6 4583 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 4584 return -EAGAIN;
ac45abc0
PB
4585 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4586 req->sync.len);
ac45abc0 4587 if (ret < 0)
93d2bcd2 4588 req_set_fail(req);
f63cf519
JA
4589 else
4590 fsnotify_modify(req->file);
e1e16097 4591 io_req_complete(req, ret);
5d17b4a4
JA
4592 return 0;
4593}
4594
ec65fea5 4595static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 4596{
f8748881 4597 const char __user *fname;
15b71abe 4598 int ret;
b7bb4f7d 4599
d3fddf6d
PB
4600 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4601 return -EINVAL;
b9445598 4602 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 4603 return -EINVAL;
ec65fea5 4604 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 4605 return -EBADF;
03b1230c 4606
ec65fea5
PB
4607 /* open.how should be already initialised */
4608 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 4609 req->open.how.flags |= O_LARGEFILE;
3529d8c2 4610
25e72d10
PB
4611 req->open.dfd = READ_ONCE(sqe->fd);
4612 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 4613 req->open.filename = getname(fname);
15b71abe
JA
4614 if (IS_ERR(req->open.filename)) {
4615 ret = PTR_ERR(req->open.filename);
4616 req->open.filename = NULL;
4617 return ret;
4618 }
b9445598
PB
4619
4620 req->open.file_slot = READ_ONCE(sqe->file_index);
4621 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4622 return -EINVAL;
4623
4022e7af 4624 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 4625 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 4626 return 0;
03b1230c
JA
4627}
4628
ec65fea5
PB
4629static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4630{
d3fddf6d
PB
4631 u64 mode = READ_ONCE(sqe->len);
4632 u64 flags = READ_ONCE(sqe->open_flags);
ec65fea5 4633
ec65fea5
PB
4634 req->open.how = build_open_how(flags, mode);
4635 return __io_openat_prep(req, sqe);
4636}
4637
cebdb986 4638static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 4639{
cebdb986 4640 struct open_how __user *how;
cebdb986 4641 size_t len;
0fa03c62
JA
4642 int ret;
4643
cebdb986
JA
4644 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4645 len = READ_ONCE(sqe->len);
cebdb986
JA
4646 if (len < OPEN_HOW_SIZE_VER0)
4647 return -EINVAL;
3529d8c2 4648
cebdb986
JA
4649 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4650 len);
4651 if (ret)
4652 return ret;
3529d8c2 4653
ec65fea5 4654 return __io_openat_prep(req, sqe);
cebdb986
JA
4655}
4656
45d189c6 4657static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
4658{
4659 struct open_flags op;
15b71abe 4660 struct file *file;
b9445598
PB
4661 bool resolve_nonblock, nonblock_set;
4662 bool fixed = !!req->open.file_slot;
15b71abe
JA
4663 int ret;
4664
cebdb986 4665 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
4666 if (ret)
4667 goto err;
3a81fd02
JA
4668 nonblock_set = op.open_flag & O_NONBLOCK;
4669 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 4670 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
4671 /*
4672 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4673 * it'll always -EAGAIN
4674 */
4675 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4676 return -EAGAIN;
4677 op.lookup_flags |= LOOKUP_CACHED;
4678 op.open_flag |= O_NONBLOCK;
4679 }
15b71abe 4680
b9445598
PB
4681 if (!fixed) {
4682 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4683 if (ret < 0)
4684 goto err;
4685 }
15b71abe
JA
4686
4687 file = do_filp_open(req->open.dfd, req->open.filename, &op);
12dcb58a 4688 if (IS_ERR(file)) {
944d1444 4689 /*
12dcb58a
PB
4690 * We could hang on to this 'fd' on retrying, but seems like
4691 * marginal gain for something that is now known to be a slower
4692 * path. So just put it, and we'll get a new one when we retry.
944d1444 4693 */
b9445598
PB
4694 if (!fixed)
4695 put_unused_fd(ret);
3a81fd02 4696
15b71abe 4697 ret = PTR_ERR(file);
12dcb58a
PB
4698 /* only retry if RESOLVE_CACHED wasn't already set by application */
4699 if (ret == -EAGAIN &&
4700 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4701 return -EAGAIN;
4702 goto err;
15b71abe 4703 }
12dcb58a
PB
4704
4705 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4706 file->f_flags &= ~O_NONBLOCK;
4707 fsnotify_open(file);
b9445598
PB
4708
4709 if (!fixed)
4710 fd_install(ret, file);
4711 else
4712 ret = io_install_fixed_file(req, file, issue_flags,
4713 req->open.file_slot - 1);
15b71abe
JA
4714err:
4715 putname(req->open.filename);
8fef80bf 4716 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe 4717 if (ret < 0)
93d2bcd2 4718 req_set_fail(req);
0bdf3398 4719 __io_req_complete(req, issue_flags, ret, 0);
15b71abe
JA
4720 return 0;
4721}
4722
45d189c6 4723static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 4724{
e45cff58 4725 return io_openat2(req, issue_flags);
cebdb986
JA
4726}
4727
067524e9
JA
4728static int io_remove_buffers_prep(struct io_kiocb *req,
4729 const struct io_uring_sqe *sqe)
4730{
4731 struct io_provide_buf *p = &req->pbuf;
4732 u64 tmp;
4733
26578cda
PB
4734 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4735 sqe->splice_fd_in)
067524e9
JA
4736 return -EINVAL;
4737
4738 tmp = READ_ONCE(sqe->fd);
4739 if (!tmp || tmp > USHRT_MAX)
4740 return -EINVAL;
4741
4742 memset(p, 0, sizeof(*p));
4743 p->nbufs = tmp;
4744 p->bgid = READ_ONCE(sqe->buf_group);
4745 return 0;
4746}
4747
dbc7d452
JA
4748static int __io_remove_buffers(struct io_ring_ctx *ctx,
4749 struct io_buffer_list *bl, unsigned nbufs)
067524e9
JA
4750{
4751 unsigned i = 0;
4752
4753 /* shouldn't happen */
4754 if (!nbufs)
4755 return 0;
4756
4757 /* the head kbuf is the list itself */
dbc7d452 4758 while (!list_empty(&bl->buf_list)) {
067524e9
JA
4759 struct io_buffer *nxt;
4760
dbc7d452 4761 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
067524e9 4762 list_del(&nxt->list);
067524e9
JA
4763 if (++i == nbufs)
4764 return i;
1d0254e6 4765 cond_resched();
067524e9
JA
4766 }
4767 i++;
067524e9
JA
4768
4769 return i;
4770}
4771
889fca73 4772static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
4773{
4774 struct io_provide_buf *p = &req->pbuf;
4775 struct io_ring_ctx *ctx = req->ctx;
dbc7d452 4776 struct io_buffer_list *bl;
067524e9
JA
4777 int ret = 0;
4778
f8929630 4779 io_ring_submit_lock(ctx, issue_flags);
067524e9
JA
4780
4781 ret = -ENOENT;
dbc7d452
JA
4782 bl = io_buffer_get_list(ctx, p->bgid);
4783 if (bl)
4784 ret = __io_remove_buffers(ctx, bl, p->nbufs);
067524e9 4785 if (ret < 0)
93d2bcd2 4786 req_set_fail(req);
067524e9 4787
9fb8cb49
PB
4788 /* complete before unlock, IOPOLL may need the lock */
4789 __io_req_complete(req, issue_flags, ret, 0);
f8929630 4790 io_ring_submit_unlock(ctx, issue_flags);
067524e9
JA
4791 return 0;
4792}
4793
ddf0322d
JA
4794static int io_provide_buffers_prep(struct io_kiocb *req,
4795 const struct io_uring_sqe *sqe)
4796{
38134ada 4797 unsigned long size, tmp_check;
ddf0322d
JA
4798 struct io_provide_buf *p = &req->pbuf;
4799 u64 tmp;
4800
26578cda 4801 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
ddf0322d
JA
4802 return -EINVAL;
4803
4804 tmp = READ_ONCE(sqe->fd);
4805 if (!tmp || tmp > USHRT_MAX)
4806 return -E2BIG;
4807 p->nbufs = tmp;
4808 p->addr = READ_ONCE(sqe->addr);
4809 p->len = READ_ONCE(sqe->len);
4810
38134ada
PB
4811 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4812 &size))
4813 return -EOVERFLOW;
4814 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4815 return -EOVERFLOW;
4816
d81269fe
PB
4817 size = (unsigned long)p->len * p->nbufs;
4818 if (!access_ok(u64_to_user_ptr(p->addr), size))
ddf0322d
JA
4819 return -EFAULT;
4820
4821 p->bgid = READ_ONCE(sqe->buf_group);
4822 tmp = READ_ONCE(sqe->off);
4823 if (tmp > USHRT_MAX)
4824 return -E2BIG;
4825 p->bid = tmp;
4826 return 0;
4827}
4828
cc3cec83
JA
4829static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
4830{
4831 struct io_buffer *buf;
4832 struct page *page;
4833 int bufs_in_page;
4834
4835 /*
4836 * Completions that don't happen inline (eg not under uring_lock) will
4837 * add to ->io_buffers_comp. If we don't have any free buffers, check
4838 * the completion list and splice those entries first.
4839 */
4840 if (!list_empty_careful(&ctx->io_buffers_comp)) {
4841 spin_lock(&ctx->completion_lock);
4842 if (!list_empty(&ctx->io_buffers_comp)) {
4843 list_splice_init(&ctx->io_buffers_comp,
4844 &ctx->io_buffers_cache);
4845 spin_unlock(&ctx->completion_lock);
4846 return 0;
4847 }
4848 spin_unlock(&ctx->completion_lock);
4849 }
4850
4851 /*
4852 * No free buffers and no completion entries either. Allocate a new
4853 * page worth of buffer entries and add those to our freelist.
4854 */
4855 page = alloc_page(GFP_KERNEL_ACCOUNT);
4856 if (!page)
4857 return -ENOMEM;
4858
4859 list_add(&page->lru, &ctx->io_buffers_pages);
4860
4861 buf = page_address(page);
4862 bufs_in_page = PAGE_SIZE / sizeof(*buf);
4863 while (bufs_in_page) {
4864 list_add_tail(&buf->list, &ctx->io_buffers_cache);
4865 buf++;
4866 bufs_in_page--;
4867 }
4868
4869 return 0;
4870}
4871
4872static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
dbc7d452 4873 struct io_buffer_list *bl)
ddf0322d
JA
4874{
4875 struct io_buffer *buf;
4876 u64 addr = pbuf->addr;
4877 int i, bid = pbuf->bid;
4878
4879 for (i = 0; i < pbuf->nbufs; i++) {
cc3cec83
JA
4880 if (list_empty(&ctx->io_buffers_cache) &&
4881 io_refill_buffer_cache(ctx))
ddf0322d 4882 break;
cc3cec83
JA
4883 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
4884 list);
dbc7d452 4885 list_move_tail(&buf->list, &bl->buf_list);
ddf0322d 4886 buf->addr = addr;
d1f82808 4887 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
ddf0322d 4888 buf->bid = bid;
b1c62645 4889 buf->bgid = pbuf->bgid;
ddf0322d
JA
4890 addr += pbuf->len;
4891 bid++;
f240762f 4892 cond_resched();
ddf0322d
JA
4893 }
4894
dbc7d452 4895 return i ? 0 : -ENOMEM;
ddf0322d
JA
4896}
4897
889fca73 4898static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
4899{
4900 struct io_provide_buf *p = &req->pbuf;
4901 struct io_ring_ctx *ctx = req->ctx;
dbc7d452 4902 struct io_buffer_list *bl;
ddf0322d 4903 int ret = 0;
ddf0322d 4904
f8929630 4905 io_ring_submit_lock(ctx, issue_flags);
ddf0322d 4906
dbc7d452
JA
4907 bl = io_buffer_get_list(ctx, p->bgid);
4908 if (unlikely(!bl)) {
4909 bl = kmalloc(sizeof(*bl), GFP_KERNEL);
4910 if (!bl) {
4911 ret = -ENOMEM;
4912 goto err;
4913 }
4914 io_buffer_add_list(ctx, bl, p->bgid);
ddf0322d 4915 }
dbc7d452
JA
4916
4917 ret = io_add_buffers(ctx, p, bl);
4918err:
ddf0322d 4919 if (ret < 0)
93d2bcd2 4920 req_set_fail(req);
9fb8cb49
PB
4921 /* complete before unlock, IOPOLL may need the lock */
4922 __io_req_complete(req, issue_flags, ret, 0);
f8929630 4923 io_ring_submit_unlock(ctx, issue_flags);
ddf0322d 4924 return 0;
cebdb986
JA
4925}
4926
3e4827b0
JA
4927static int io_epoll_ctl_prep(struct io_kiocb *req,
4928 const struct io_uring_sqe *sqe)
4929{
4930#if defined(CONFIG_EPOLL)
26578cda 4931 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3e4827b0 4932 return -EINVAL;
2d74d042 4933 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4934 return -EINVAL;
3e4827b0
JA
4935
4936 req->epoll.epfd = READ_ONCE(sqe->fd);
4937 req->epoll.op = READ_ONCE(sqe->len);
4938 req->epoll.fd = READ_ONCE(sqe->off);
4939
4940 if (ep_op_has_event(req->epoll.op)) {
4941 struct epoll_event __user *ev;
4942
4943 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4944 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4945 return -EFAULT;
4946 }
4947
4948 return 0;
4949#else
4950 return -EOPNOTSUPP;
4951#endif
4952}
4953
889fca73 4954static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4955{
4956#if defined(CONFIG_EPOLL)
4957 struct io_epoll *ie = &req->epoll;
4958 int ret;
45d189c6 4959 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4960
4961 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4962 if (force_nonblock && ret == -EAGAIN)
4963 return -EAGAIN;
4964
4965 if (ret < 0)
93d2bcd2 4966 req_set_fail(req);
889fca73 4967 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4968 return 0;
4969#else
4970 return -EOPNOTSUPP;
4971#endif
4972}
4973
c1ca757b
JA
4974static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4975{
4976#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
26578cda 4977 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
c1ca757b 4978 return -EINVAL;
3232dd02
PB
4979 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4980 return -EINVAL;
c1ca757b
JA
4981
4982 req->madvise.addr = READ_ONCE(sqe->addr);
4983 req->madvise.len = READ_ONCE(sqe->len);
4984 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4985 return 0;
4986#else
4987 return -EOPNOTSUPP;
4988#endif
4989}
4990
45d189c6 4991static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4992{
4993#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4994 struct io_madvise *ma = &req->madvise;
4995 int ret;
4996
45d189c6 4997 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4998 return -EAGAIN;
4999
0726b01e 5000 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b 5001 if (ret < 0)
93d2bcd2 5002 req_set_fail(req);
e1e16097 5003 io_req_complete(req, ret);
c1ca757b
JA
5004 return 0;
5005#else
5006 return -EOPNOTSUPP;
5007#endif
5008}
5009
4840e418
JA
5010static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5011{
26578cda 5012 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4840e418 5013 return -EINVAL;
3232dd02
PB
5014 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5015 return -EINVAL;
4840e418
JA
5016
5017 req->fadvise.offset = READ_ONCE(sqe->off);
5018 req->fadvise.len = READ_ONCE(sqe->len);
5019 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
5020 return 0;
5021}
5022
45d189c6 5023static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
5024{
5025 struct io_fadvise *fa = &req->fadvise;
5026 int ret;
5027
45d189c6 5028 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
5029 switch (fa->advice) {
5030 case POSIX_FADV_NORMAL:
5031 case POSIX_FADV_RANDOM:
5032 case POSIX_FADV_SEQUENTIAL:
5033 break;
5034 default:
5035 return -EAGAIN;
5036 }
5037 }
4840e418
JA
5038
5039 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
5040 if (ret < 0)
93d2bcd2 5041 req_set_fail(req);
0bdf3398 5042 __io_req_complete(req, issue_flags, ret, 0);
4840e418
JA
5043 return 0;
5044}
5045
eddc7ef5
JA
5046static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5047{
1b6fe6e0
SR
5048 const char __user *path;
5049
2d74d042 5050 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 5051 return -EINVAL;
26578cda 5052 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
eddc7ef5 5053 return -EINVAL;
9c280f90 5054 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 5055 return -EBADF;
eddc7ef5 5056
1d9e1288
BM
5057 req->statx.dfd = READ_ONCE(sqe->fd);
5058 req->statx.mask = READ_ONCE(sqe->len);
1b6fe6e0 5059 path = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
5060 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5061 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 5062
1b6fe6e0
SR
5063 req->statx.filename = getname_flags(path,
5064 getname_statx_lookup_flags(req->statx.flags),
5065 NULL);
5066
5067 if (IS_ERR(req->statx.filename)) {
5068 int ret = PTR_ERR(req->statx.filename);
5069
5070 req->statx.filename = NULL;
5071 return ret;
5072 }
5073
5074 req->flags |= REQ_F_NEED_CLEANUP;
eddc7ef5
JA
5075 return 0;
5076}
5077
45d189c6 5078static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 5079{
1d9e1288 5080 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
5081 int ret;
5082
59d70013 5083 if (issue_flags & IO_URING_F_NONBLOCK)
eddc7ef5
JA
5084 return -EAGAIN;
5085
e62753e4
BM
5086 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
5087 ctx->buffer);
eddc7ef5 5088
eddc7ef5 5089 if (ret < 0)
93d2bcd2 5090 req_set_fail(req);
e1e16097 5091 io_req_complete(req, ret);
eddc7ef5
JA
5092 return 0;
5093}
5094
b5dba59e
JA
5095static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5096{
14587a46 5097 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 5098 return -EINVAL;
b5dba59e 5099 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
7df778be 5100 sqe->rw_flags || sqe->buf_index)
b5dba59e 5101 return -EINVAL;
9c280f90 5102 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 5103 return -EBADF;
b5dba59e
JA
5104
5105 req->close.fd = READ_ONCE(sqe->fd);
7df778be
PB
5106 req->close.file_slot = READ_ONCE(sqe->file_index);
5107 if (req->close.file_slot && req->close.fd)
5108 return -EINVAL;
5109
b5dba59e 5110 return 0;
b5dba59e
JA
5111}
5112
889fca73 5113static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 5114{
9eac1904 5115 struct files_struct *files = current->files;
3af73b28 5116 struct io_close *close = &req->close;
9eac1904 5117 struct fdtable *fdt;
a1fde923
PB
5118 struct file *file = NULL;
5119 int ret = -EBADF;
b5dba59e 5120
7df778be
PB
5121 if (req->close.file_slot) {
5122 ret = io_close_fixed(req, issue_flags);
5123 goto err;
5124 }
5125
9eac1904
JA
5126 spin_lock(&files->file_lock);
5127 fdt = files_fdtable(files);
5128 if (close->fd >= fdt->max_fds) {
5129 spin_unlock(&files->file_lock);
5130 goto err;
5131 }
5132 file = fdt->fd[close->fd];
a1fde923 5133 if (!file || file->f_op == &io_uring_fops) {
9eac1904
JA
5134 spin_unlock(&files->file_lock);
5135 file = NULL;
5136 goto err;
3af73b28 5137 }
b5dba59e
JA
5138
5139 /* if the file has a flush method, be safe and punt to async */
45d189c6 5140 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 5141 spin_unlock(&files->file_lock);
0bf0eefd 5142 return -EAGAIN;
a2100672 5143 }
b5dba59e 5144
9eac1904
JA
5145 ret = __close_fd_get_file(close->fd, &file);
5146 spin_unlock(&files->file_lock);
5147 if (ret < 0) {
5148 if (ret == -ENOENT)
5149 ret = -EBADF;
5150 goto err;
5151 }
5152
3af73b28 5153 /* No ->flush() or already async, safely close from here */
9eac1904
JA
5154 ret = filp_close(file, current->files);
5155err:
3af73b28 5156 if (ret < 0)
93d2bcd2 5157 req_set_fail(req);
9eac1904
JA
5158 if (file)
5159 fput(file);
889fca73 5160 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 5161 return 0;
b5dba59e
JA
5162}
5163
1155c76a 5164static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
5165{
5166 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 5167
5d17b4a4
JA
5168 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
5169 return -EINVAL;
26578cda
PB
5170 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
5171 sqe->splice_fd_in))
5d17b4a4
JA
5172 return -EINVAL;
5173
8ed8d3c3
JA
5174 req->sync.off = READ_ONCE(sqe->off);
5175 req->sync.len = READ_ONCE(sqe->len);
5176 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
5177 return 0;
5178}
5179
45d189c6 5180static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 5181{
8ed8d3c3
JA
5182 int ret;
5183
ac45abc0 5184 /* sync_file_range always requires a blocking context */
45d189c6 5185 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
5186 return -EAGAIN;
5187
9adbd45d 5188 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
5189 req->sync.flags);
5190 if (ret < 0)
93d2bcd2 5191 req_set_fail(req);
e1e16097 5192 io_req_complete(req, ret);
5d17b4a4
JA
5193 return 0;
5194}
5195
469956e8 5196#if defined(CONFIG_NET)
02d27d89
PB
5197static int io_setup_async_msg(struct io_kiocb *req,
5198 struct io_async_msghdr *kmsg)
5199{
e8c2bc1f
JA
5200 struct io_async_msghdr *async_msg = req->async_data;
5201
5202 if (async_msg)
02d27d89 5203 return -EAGAIN;
e8c2bc1f 5204 if (io_alloc_async_data(req)) {
257e84a5 5205 kfree(kmsg->free_iov);
02d27d89
PB
5206 return -ENOMEM;
5207 }
e8c2bc1f 5208 async_msg = req->async_data;
02d27d89 5209 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 5210 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 5211 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
5212 /* if were using fast_iov, set it to the new one */
5213 if (!async_msg->free_iov)
5214 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
5215
02d27d89
PB
5216 return -EAGAIN;
5217}
5218
2ae523ed
PB
5219static int io_sendmsg_copy_hdr(struct io_kiocb *req,
5220 struct io_async_msghdr *iomsg)
5221{
2ae523ed 5222 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 5223 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 5224 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 5225 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
5226}
5227
93642ef8
PB
5228static int io_sendmsg_prep_async(struct io_kiocb *req)
5229{
5230 int ret;
5231
93642ef8
PB
5232 ret = io_sendmsg_copy_hdr(req, req->async_data);
5233 if (!ret)
5234 req->flags |= REQ_F_NEED_CLEANUP;
5235 return ret;
5236}
5237
3529d8c2 5238static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 5239{
e47293fd 5240 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 5241
d2b6f48b
PB
5242 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5243 return -EINVAL;
5244
270a5940 5245 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 5246 sr->len = READ_ONCE(sqe->len);
04411806
PB
5247 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5248 if (sr->msg_flags & MSG_DONTWAIT)
5249 req->flags |= REQ_F_NOWAIT;
3529d8c2 5250
d8768362
JA
5251#ifdef CONFIG_COMPAT
5252 if (req->ctx->compat)
5253 sr->msg_flags |= MSG_CMSG_COMPAT;
5254#endif
93642ef8 5255 return 0;
03b1230c
JA
5256}
5257
889fca73 5258static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 5259{
6b754c8b 5260 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 5261 struct socket *sock;
7a7cacba 5262 unsigned flags;
0031275d 5263 int min_ret = 0;
0fa03c62
JA
5264 int ret;
5265
dba4a925 5266 sock = sock_from_file(req->file);
7a7cacba 5267 if (unlikely(!sock))
dba4a925 5268 return -ENOTSOCK;
3529d8c2 5269
d886e185
PB
5270 if (req_has_async_data(req)) {
5271 kmsg = req->async_data;
5272 } else {
7a7cacba
PB
5273 ret = io_sendmsg_copy_hdr(req, &iomsg);
5274 if (ret)
5275 return ret;
5276 kmsg = &iomsg;
0fa03c62 5277 }
0fa03c62 5278
04411806
PB
5279 flags = req->sr_msg.msg_flags;
5280 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 5281 flags |= MSG_DONTWAIT;
0031275d
SM
5282 if (flags & MSG_WAITALL)
5283 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5284
7a7cacba 5285 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
0fa03c62 5286
7297ce3d
PB
5287 if (ret < min_ret) {
5288 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5289 return io_setup_async_msg(req, kmsg);
5290 if (ret == -ERESTARTSYS)
5291 ret = -EINTR;
5292 req_set_fail(req);
5293 }
257e84a5
PB
5294 /* fast path, check for non-NULL to avoid function call */
5295 if (kmsg->free_iov)
5296 kfree(kmsg->free_iov);
99bc4c38 5297 req->flags &= ~REQ_F_NEED_CLEANUP;
889fca73 5298 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 5299 return 0;
03b1230c 5300}
aa1fa28f 5301
889fca73 5302static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 5303{
7a7cacba
PB
5304 struct io_sr_msg *sr = &req->sr_msg;
5305 struct msghdr msg;
5306 struct iovec iov;
fddaface 5307 struct socket *sock;
7a7cacba 5308 unsigned flags;
0031275d 5309 int min_ret = 0;
fddaface
JA
5310 int ret;
5311
dba4a925 5312 sock = sock_from_file(req->file);
7a7cacba 5313 if (unlikely(!sock))
dba4a925 5314 return -ENOTSOCK;
fddaface 5315
7a7cacba
PB
5316 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
5317 if (unlikely(ret))
14db8411 5318 return ret;
fddaface 5319
7a7cacba
PB
5320 msg.msg_name = NULL;
5321 msg.msg_control = NULL;
5322 msg.msg_controllen = 0;
5323 msg.msg_namelen = 0;
fddaface 5324
04411806
PB
5325 flags = req->sr_msg.msg_flags;
5326 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 5327 flags |= MSG_DONTWAIT;
0031275d
SM
5328 if (flags & MSG_WAITALL)
5329 min_ret = iov_iter_count(&msg.msg_iter);
5330
7a7cacba
PB
5331 msg.msg_flags = flags;
5332 ret = sock_sendmsg(sock, &msg);
7297ce3d
PB
5333 if (ret < min_ret) {
5334 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5335 return -EAGAIN;
5336 if (ret == -ERESTARTSYS)
5337 ret = -EINTR;
93d2bcd2 5338 req_set_fail(req);
7297ce3d 5339 }
889fca73 5340 __io_req_complete(req, issue_flags, ret, 0);
fddaface 5341 return 0;
fddaface
JA
5342}
5343
1400e697
PB
5344static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
5345 struct io_async_msghdr *iomsg)
52de1fe1
JA
5346{
5347 struct io_sr_msg *sr = &req->sr_msg;
5348 struct iovec __user *uiov;
5349 size_t iov_len;
5350 int ret;
5351
1400e697
PB
5352 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
5353 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
5354 if (ret)
5355 return ret;
5356
5357 if (req->flags & REQ_F_BUFFER_SELECT) {
5358 if (iov_len > 1)
5359 return -EINVAL;
5476dfed 5360 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 5361 return -EFAULT;
5476dfed 5362 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 5363 iomsg->free_iov = NULL;
52de1fe1 5364 } else {
257e84a5 5365 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 5366 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 5367 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 5368 false);
52de1fe1
JA
5369 if (ret > 0)
5370 ret = 0;
5371 }
5372
5373 return ret;
5374}
5375
5376#ifdef CONFIG_COMPAT
5377static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 5378 struct io_async_msghdr *iomsg)
52de1fe1 5379{
52de1fe1
JA
5380 struct io_sr_msg *sr = &req->sr_msg;
5381 struct compat_iovec __user *uiov;
5382 compat_uptr_t ptr;
5383 compat_size_t len;
5384 int ret;
5385
4af3417a
PB
5386 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
5387 &ptr, &len);
52de1fe1
JA
5388 if (ret)
5389 return ret;
5390
5391 uiov = compat_ptr(ptr);
5392 if (req->flags & REQ_F_BUFFER_SELECT) {
5393 compat_ssize_t clen;
5394
5395 if (len > 1)
5396 return -EINVAL;
5397 if (!access_ok(uiov, sizeof(*uiov)))
5398 return -EFAULT;
5399 if (__get_user(clen, &uiov->iov_len))
5400 return -EFAULT;
5401 if (clen < 0)
5402 return -EINVAL;
2d280bc8 5403 sr->len = clen;
257e84a5 5404 iomsg->free_iov = NULL;
52de1fe1 5405 } else {
257e84a5 5406 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 5407 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 5408 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 5409 &iomsg->msg.msg_iter, true);
52de1fe1
JA
5410 if (ret < 0)
5411 return ret;
5412 }
5413
5414 return 0;
5415}
5416#endif
5417
1400e697
PB
5418static int io_recvmsg_copy_hdr(struct io_kiocb *req,
5419 struct io_async_msghdr *iomsg)
52de1fe1 5420{
1400e697 5421 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
5422
5423#ifdef CONFIG_COMPAT
5424 if (req->ctx->compat)
1400e697 5425 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 5426#endif
52de1fe1 5427
1400e697 5428 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
5429}
5430
bcda7baa 5431static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
51aac424 5432 unsigned int issue_flags)
bcda7baa
JA
5433{
5434 struct io_sr_msg *sr = &req->sr_msg;
bcda7baa 5435
51aac424 5436 return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
fddaface
JA
5437}
5438
93642ef8 5439static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 5440{
99bc4c38 5441 int ret;
3529d8c2 5442
93642ef8
PB
5443 ret = io_recvmsg_copy_hdr(req, req->async_data);
5444 if (!ret)
5445 req->flags |= REQ_F_NEED_CLEANUP;
5446 return ret;
5447}
5448
5449static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5450{
5451 struct io_sr_msg *sr = &req->sr_msg;
5452
d2b6f48b
PB
5453 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5454 return -EINVAL;
5455
270a5940 5456 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 5457 sr->len = READ_ONCE(sqe->len);
bcda7baa 5458 sr->bgid = READ_ONCE(sqe->buf_group);
04411806
PB
5459 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5460 if (sr->msg_flags & MSG_DONTWAIT)
5461 req->flags |= REQ_F_NOWAIT;
06b76d44 5462
d8768362
JA
5463#ifdef CONFIG_COMPAT
5464 if (req->ctx->compat)
5465 sr->msg_flags |= MSG_CMSG_COMPAT;
5466#endif
7ba89d2a 5467 sr->done_io = 0;
93642ef8 5468 return 0;
aa1fa28f
JA
5469}
5470
7ba89d2a
JA
5471static bool io_net_retry(struct socket *sock, int flags)
5472{
5473 if (!(flags & MSG_WAITALL))
5474 return false;
5475 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
5476}
5477
889fca73 5478static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 5479{
6b754c8b 5480 struct io_async_msghdr iomsg, *kmsg;
7ba89d2a 5481 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 5482 struct socket *sock;
7fbb1b54 5483 struct io_buffer *kbuf;
7a7cacba 5484 unsigned flags;
d1fd1c20 5485 int ret, min_ret = 0;
45d189c6 5486 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 5487
dba4a925 5488 sock = sock_from_file(req->file);
7a7cacba 5489 if (unlikely(!sock))
dba4a925 5490 return -ENOTSOCK;
3529d8c2 5491
d886e185
PB
5492 if (req_has_async_data(req)) {
5493 kmsg = req->async_data;
5494 } else {
7a7cacba
PB
5495 ret = io_recvmsg_copy_hdr(req, &iomsg);
5496 if (ret)
681fda8d 5497 return ret;
7a7cacba
PB
5498 kmsg = &iomsg;
5499 }
03b1230c 5500
bc02ef33 5501 if (req->flags & REQ_F_BUFFER_SELECT) {
51aac424 5502 kbuf = io_recv_buffer_select(req, issue_flags);
bc02ef33 5503 if (IS_ERR(kbuf))
52de1fe1 5504 return PTR_ERR(kbuf);
7a7cacba 5505 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
5506 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
5507 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
5508 1, req->sr_msg.len);
5509 }
52de1fe1 5510
04411806
PB
5511 flags = req->sr_msg.msg_flags;
5512 if (force_nonblock)
7a7cacba 5513 flags |= MSG_DONTWAIT;
0031275d
SM
5514 if (flags & MSG_WAITALL)
5515 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5516
7a7cacba
PB
5517 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5518 kmsg->uaddr, flags);
7297ce3d
PB
5519 if (ret < min_ret) {
5520 if (ret == -EAGAIN && force_nonblock)
5521 return io_setup_async_msg(req, kmsg);
5522 if (ret == -ERESTARTSYS)
5523 ret = -EINTR;
7ba89d2a
JA
5524 if (ret > 0 && io_net_retry(sock, flags)) {
5525 sr->done_io += ret;
8a3e8ee5 5526 req->flags |= REQ_F_PARTIAL_IO;
7ba89d2a
JA
5527 return io_setup_async_msg(req, kmsg);
5528 }
7297ce3d
PB
5529 req_set_fail(req);
5530 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5531 req_set_fail(req);
5532 }
03b1230c 5533
257e84a5
PB
5534 /* fast path, check for non-NULL to avoid function call */
5535 if (kmsg->free_iov)
5536 kfree(kmsg->free_iov);
99bc4c38 5537 req->flags &= ~REQ_F_NEED_CLEANUP;
7ba89d2a
JA
5538 if (ret >= 0)
5539 ret += sr->done_io;
5540 else if (sr->done_io)
5541 ret = sr->done_io;
cc3cec83 5542 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
03b1230c 5543 return 0;
0fa03c62 5544}
5d17b4a4 5545
889fca73 5546static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 5547{
6b754c8b 5548 struct io_buffer *kbuf;
7a7cacba
PB
5549 struct io_sr_msg *sr = &req->sr_msg;
5550 struct msghdr msg;
5551 void __user *buf = sr->buf;
fddaface 5552 struct socket *sock;
7a7cacba
PB
5553 struct iovec iov;
5554 unsigned flags;
d1fd1c20 5555 int ret, min_ret = 0;
45d189c6 5556 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 5557
dba4a925 5558 sock = sock_from_file(req->file);
7a7cacba 5559 if (unlikely(!sock))
dba4a925 5560 return -ENOTSOCK;
fddaface 5561
bc02ef33 5562 if (req->flags & REQ_F_BUFFER_SELECT) {
51aac424 5563 kbuf = io_recv_buffer_select(req, issue_flags);
bcda7baa
JA
5564 if (IS_ERR(kbuf))
5565 return PTR_ERR(kbuf);
7a7cacba 5566 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 5567 }
bcda7baa 5568
7a7cacba 5569 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
5570 if (unlikely(ret))
5571 goto out_free;
fddaface 5572
7a7cacba
PB
5573 msg.msg_name = NULL;
5574 msg.msg_control = NULL;
5575 msg.msg_controllen = 0;
5576 msg.msg_namelen = 0;
5577 msg.msg_iocb = NULL;
5578 msg.msg_flags = 0;
fddaface 5579
04411806
PB
5580 flags = req->sr_msg.msg_flags;
5581 if (force_nonblock)
7a7cacba 5582 flags |= MSG_DONTWAIT;
0031275d
SM
5583 if (flags & MSG_WAITALL)
5584 min_ret = iov_iter_count(&msg.msg_iter);
5585
7a7cacba 5586 ret = sock_recvmsg(sock, &msg, flags);
7297ce3d
PB
5587 if (ret < min_ret) {
5588 if (ret == -EAGAIN && force_nonblock)
5589 return -EAGAIN;
5590 if (ret == -ERESTARTSYS)
5591 ret = -EINTR;
7ba89d2a
JA
5592 if (ret > 0 && io_net_retry(sock, flags)) {
5593 sr->len -= ret;
5594 sr->buf += ret;
5595 sr->done_io += ret;
8a3e8ee5 5596 req->flags |= REQ_F_PARTIAL_IO;
7ba89d2a
JA
5597 return -EAGAIN;
5598 }
7297ce3d
PB
5599 req_set_fail(req);
5600 } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
0d7c1153 5601out_free:
93d2bcd2 5602 req_set_fail(req);
7297ce3d 5603 }
cc3cec83 5604
7ba89d2a
JA
5605 if (ret >= 0)
5606 ret += sr->done_io;
5607 else if (sr->done_io)
5608 ret = sr->done_io;
cc3cec83 5609 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
fddaface 5610 return 0;
fddaface
JA
5611}
5612
3529d8c2 5613static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 5614{
8ed8d3c3
JA
5615 struct io_accept *accept = &req->accept;
5616
14587a46 5617 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 5618 return -EINVAL;
aaa4db12 5619 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
5620 return -EINVAL;
5621
d55e5f5b
JA
5622 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5623 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 5624 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 5625 accept->nofile = rlimit(RLIMIT_NOFILE);
a7083ad5 5626
aaa4db12 5627 accept->file_slot = READ_ONCE(sqe->file_index);
adf3a9e9 5628 if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
aaa4db12 5629 return -EINVAL;
a7083ad5
PB
5630 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5631 return -EINVAL;
5632 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5633 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
8ed8d3c3 5634 return 0;
8ed8d3c3 5635}
17f2fe35 5636
889fca73 5637static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
5638{
5639 struct io_accept *accept = &req->accept;
45d189c6 5640 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 5641 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
aaa4db12 5642 bool fixed = !!accept->file_slot;
a7083ad5
PB
5643 struct file *file;
5644 int ret, fd;
8ed8d3c3 5645
aaa4db12
PB
5646 if (!fixed) {
5647 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5648 if (unlikely(fd < 0))
5649 return fd;
5650 }
a7083ad5
PB
5651 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5652 accept->flags);
5653 if (IS_ERR(file)) {
aaa4db12
PB
5654 if (!fixed)
5655 put_unused_fd(fd);
a7083ad5
PB
5656 ret = PTR_ERR(file);
5657 if (ret == -EAGAIN && force_nonblock)
5658 return -EAGAIN;
ac45abc0
PB
5659 if (ret == -ERESTARTSYS)
5660 ret = -EINTR;
93d2bcd2 5661 req_set_fail(req);
aaa4db12 5662 } else if (!fixed) {
a7083ad5
PB
5663 fd_install(fd, file);
5664 ret = fd;
aaa4db12
PB
5665 } else {
5666 ret = io_install_fixed_file(req, file, issue_flags,
5667 accept->file_slot - 1);
ac45abc0 5668 }
889fca73 5669 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 5670 return 0;
8ed8d3c3
JA
5671}
5672
93642ef8
PB
5673static int io_connect_prep_async(struct io_kiocb *req)
5674{
5675 struct io_async_connect *io = req->async_data;
5676 struct io_connect *conn = &req->connect;
5677
5678 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5679}
5680
3529d8c2 5681static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 5682{
3529d8c2 5683 struct io_connect *conn = &req->connect;
f499a021 5684
14587a46 5685 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1 5686 return -EINVAL;
26578cda
PB
5687 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5688 sqe->splice_fd_in)
3fbb51c1
JA
5689 return -EINVAL;
5690
3529d8c2
JA
5691 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5692 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 5693 return 0;
f499a021
JA
5694}
5695
889fca73 5696static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 5697{
e8c2bc1f 5698 struct io_async_connect __io, *io;
f8e85cf2 5699 unsigned file_flags;
3fbb51c1 5700 int ret;
45d189c6 5701 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 5702
d886e185 5703 if (req_has_async_data(req)) {
e8c2bc1f 5704 io = req->async_data;
f499a021 5705 } else {
3529d8c2
JA
5706 ret = move_addr_to_kernel(req->connect.addr,
5707 req->connect.addr_len,
e8c2bc1f 5708 &__io.address);
f499a021
JA
5709 if (ret)
5710 goto out;
5711 io = &__io;
5712 }
5713
3fbb51c1
JA
5714 file_flags = force_nonblock ? O_NONBLOCK : 0;
5715
e8c2bc1f 5716 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 5717 req->connect.addr_len, file_flags);
87f80d62 5718 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
d886e185 5719 if (req_has_async_data(req))
b7bb4f7d 5720 return -EAGAIN;
e8c2bc1f 5721 if (io_alloc_async_data(req)) {
f499a021
JA
5722 ret = -ENOMEM;
5723 goto out;
5724 }
e8c2bc1f 5725 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 5726 return -EAGAIN;
f499a021 5727 }
f8e85cf2
JA
5728 if (ret == -ERESTARTSYS)
5729 ret = -EINTR;
f499a021 5730out:
4e88d6e7 5731 if (ret < 0)
93d2bcd2 5732 req_set_fail(req);
889fca73 5733 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 5734 return 0;
469956e8
Y
5735}
5736#else /* !CONFIG_NET */
99a10081
JA
5737#define IO_NETOP_FN(op) \
5738static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
5739{ \
5740 return -EOPNOTSUPP; \
5741}
5742
5743#define IO_NETOP_PREP(op) \
5744IO_NETOP_FN(op) \
5745static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5746{ \
5747 return -EOPNOTSUPP; \
5748} \
5749
5750#define IO_NETOP_PREP_ASYNC(op) \
5751IO_NETOP_PREP(op) \
5752static int io_##op##_prep_async(struct io_kiocb *req) \
5753{ \
5754 return -EOPNOTSUPP; \
5755}
5756
5757IO_NETOP_PREP_ASYNC(sendmsg);
5758IO_NETOP_PREP_ASYNC(recvmsg);
5759IO_NETOP_PREP_ASYNC(connect);
5760IO_NETOP_PREP(accept);
5761IO_NETOP_FN(send);
5762IO_NETOP_FN(recv);
469956e8 5763#endif /* CONFIG_NET */
f8e85cf2 5764
d7718a9d
JA
5765struct io_poll_table {
5766 struct poll_table_struct pt;
5767 struct io_kiocb *req;
68b11e8b 5768 int nr_entries;
d7718a9d
JA
5769 int error;
5770};
ce593a6c 5771
aa43477b 5772#define IO_POLL_CANCEL_FLAG BIT(31)
e2c0cb7c 5773#define IO_POLL_REF_MASK GENMASK(30, 0)
6d816e08 5774
aa43477b
PB
5775/*
5776 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
5777 * bump it and acquire ownership. It's disallowed to modify requests while not
5778 * owning it, that prevents from races for enqueueing task_work's and b/w
5779 * arming poll and wakeups.
5780 */
5781static inline bool io_poll_get_ownership(struct io_kiocb *req)
5782{
5783 return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
d7718a9d
JA
5784}
5785
aa43477b 5786static void io_poll_mark_cancelled(struct io_kiocb *req)
74ce6ce4 5787{
aa43477b 5788 atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
74ce6ce4
JA
5789}
5790
d4e7cd36 5791static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 5792{
e8c2bc1f 5793 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 5794 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 5795 return req->async_data;
d4e7cd36
JA
5796 return req->apoll->double_poll;
5797}
5798
5799static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5800{
5801 if (req->opcode == IORING_OP_POLL_ADD)
5802 return &req->poll;
5803 return &req->apoll->poll;
5804}
5805
5641897a 5806static void io_poll_req_insert(struct io_kiocb *req)
d4e7cd36 5807{
5641897a
PB
5808 struct io_ring_ctx *ctx = req->ctx;
5809 struct hlist_head *list;
18bceab1 5810
cef216fc 5811 list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
5641897a 5812 hlist_add_head(&req->hash_node, list);
18bceab1
JA
5813}
5814
5641897a
PB
5815static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5816 wait_queue_func_t wake_func)
18bceab1 5817{
5641897a 5818 poll->head = NULL;
5641897a
PB
5819#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5820 /* mask in events that we always want/need */
5821 poll->events = events | IO_POLL_UNMASK;
5822 INIT_LIST_HEAD(&poll->wait.entry);
5823 init_waitqueue_func_entry(&poll->wait, wake_func);
18bceab1
JA
5824}
5825
aa43477b 5826static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
18bceab1 5827{
791f3465 5828 struct wait_queue_head *head = smp_load_acquire(&poll->head);
18bceab1 5829
791f3465
PB
5830 if (head) {
5831 spin_lock_irq(&head->lock);
5832 list_del_init(&poll->wait.entry);
5833 poll->head = NULL;
5834 spin_unlock_irq(&head->lock);
5835 }
aa43477b 5836}
18bceab1 5837
aa43477b
PB
5838static void io_poll_remove_entries(struct io_kiocb *req)
5839{
91eac1c6
JA
5840 /*
5841 * Nothing to do if neither of those flags are set. Avoid dipping
5842 * into the poll/apoll/double cachelines if we can.
5843 */
5844 if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
5845 return;
18bceab1 5846
791f3465
PB
5847 /*
5848 * While we hold the waitqueue lock and the waitqueue is nonempty,
5849 * wake_up_pollfree() will wait for us. However, taking the waitqueue
5850 * lock in the first place can race with the waitqueue being freed.
5851 *
5852 * We solve this as eventpoll does: by taking advantage of the fact that
5853 * all users of wake_up_pollfree() will RCU-delay the actual free. If
5854 * we enter rcu_read_lock() and see that the pointer to the queue is
5855 * non-NULL, we can then lock it without the memory being freed out from
5856 * under us.
5857 *
5858 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
5859 * case the caller deletes the entry from the queue, leaving it empty.
5860 * In that case, only RCU prevents the queue memory from being freed.
5861 */
5862 rcu_read_lock();
91eac1c6
JA
5863 if (req->flags & REQ_F_SINGLE_POLL)
5864 io_poll_remove_entry(io_poll_get_single(req));
5865 if (req->flags & REQ_F_DOUBLE_POLL)
5866 io_poll_remove_entry(io_poll_get_double(req));
791f3465 5867 rcu_read_unlock();
18bceab1
JA
5868}
5869
aa43477b
PB
5870/*
5871 * All poll tw should go through this. Checks for poll events, manages
5872 * references, does rewait, etc.
5873 *
5874 * Returns a negative error on failure. >0 when no action require, which is
5875 * either spurious wakeup or multishot CQE is served. 0 when it's done with
cef216fc 5876 * the request, then the mask is stored in req->cqe.res.
aa43477b 5877 */
5106dd6e 5878static int io_poll_check_events(struct io_kiocb *req, bool locked)
18bceab1 5879{
74ce6ce4 5880 struct io_ring_ctx *ctx = req->ctx;
aa43477b 5881 int v;
18bceab1 5882
316319e8 5883 /* req->task == current here, checking PF_EXITING is safe */
e09ee510 5884 if (unlikely(req->task->flags & PF_EXITING))
f2219057 5885 return -ECANCELED;
18bceab1 5886
aa43477b
PB
5887 do {
5888 v = atomic_read(&req->poll_refs);
74ce6ce4 5889
aa43477b
PB
5890 /* tw handler should be the owner, and so have some references */
5891 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
5892 return 0;
5893 if (v & IO_POLL_CANCEL_FLAG)
5894 return -ECANCELED;
8706e04e 5895
cef216fc 5896 if (!req->cqe.res) {
2804ecd8 5897 struct poll_table_struct pt = { ._key = req->apoll_events };
cce64ef0 5898 unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
18bceab1 5899
cce64ef0 5900 if (unlikely(!io_assign_file(req, flags)))
7179c3ce 5901 return -EBADF;
cef216fc 5902 req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
c8b5e260 5903 }
74ce6ce4 5904
aa43477b 5905 /* multishot, just fill an CQE and proceed */
cef216fc
PB
5906 if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) {
5907 __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events);
aa43477b 5908 bool filled;
18bceab1 5909
aa43477b 5910 spin_lock(&ctx->completion_lock);
cef216fc 5911 filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask,
aa43477b
PB
5912 IORING_CQE_F_MORE);
5913 io_commit_cqring(ctx);
5914 spin_unlock(&ctx->completion_lock);
5915 if (unlikely(!filled))
5916 return -ECANCELED;
5917 io_cqring_ev_posted(ctx);
cef216fc 5918 } else if (req->cqe.res) {
aa43477b
PB
5919 return 0;
5920 }
18bceab1 5921
aa43477b
PB
5922 /*
5923 * Release all references, retry if someone tried to restart
5924 * task_work while we were executing it.
5925 */
5926 } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
18bceab1 5927
18bceab1
JA
5928 return 1;
5929}
5930
aa43477b 5931static void io_poll_task_func(struct io_kiocb *req, bool *locked)
18bceab1 5932{
18bceab1 5933 struct io_ring_ctx *ctx = req->ctx;
aa43477b 5934 int ret;
18bceab1 5935
5106dd6e 5936 ret = io_poll_check_events(req, *locked);
aa43477b
PB
5937 if (ret > 0)
5938 return;
5939
5940 if (!ret) {
cef216fc 5941 req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
e27414be 5942 } else {
cef216fc 5943 req->cqe.res = ret;
aa43477b 5944 req_set_fail(req);
a62682f9 5945 }
aa43477b
PB
5946
5947 io_poll_remove_entries(req);
5948 spin_lock(&ctx->completion_lock);
5949 hash_del(&req->hash_node);
cef216fc 5950 __io_req_complete_post(req, req->cqe.res, 0);
aa43477b
PB
5951 io_commit_cqring(ctx);
5952 spin_unlock(&ctx->completion_lock);
5953 io_cqring_ev_posted(ctx);
18bceab1
JA
5954}
5955
aa43477b 5956static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
18bceab1
JA
5957{
5958 struct io_ring_ctx *ctx = req->ctx;
aa43477b 5959 int ret;
18bceab1 5960
5106dd6e 5961 ret = io_poll_check_events(req, *locked);
aa43477b
PB
5962 if (ret > 0)
5963 return;
18bceab1 5964
aa43477b
PB
5965 io_poll_remove_entries(req);
5966 spin_lock(&ctx->completion_lock);
5967 hash_del(&req->hash_node);
5968 spin_unlock(&ctx->completion_lock);
18bceab1 5969
aa43477b
PB
5970 if (!ret)
5971 io_req_task_submit(req, locked);
5972 else
5973 io_req_complete_failed(req, ret);
18bceab1
JA
5974}
5975
81459350 5976static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
aa43477b 5977{
cef216fc 5978 req->cqe.res = mask;
81459350
JA
5979 /*
5980 * This is useful for poll that is armed on behalf of another
5981 * request, and where the wakeup path could be on a different
5982 * CPU. We want to avoid pulling in req->apoll->events for that
5983 * case.
5984 */
2804ecd8 5985 req->apoll_events = events;
aa43477b
PB
5986 if (req->opcode == IORING_OP_POLL_ADD)
5987 req->io_task_work.func = io_poll_task_func;
5988 else
5989 req->io_task_work.func = io_apoll_task_func;
5990
cef216fc 5991 trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
aa43477b
PB
5992 io_req_task_work_add(req, false);
5993}
5994
81459350 5995static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
aa43477b
PB
5996{
5997 if (io_poll_get_ownership(req))
81459350 5998 __io_poll_execute(req, res, events);
aa43477b
PB
5999}
6000
6001static void io_poll_cancel_req(struct io_kiocb *req)
6002{
6003 io_poll_mark_cancelled(req);
6004 /* kick tw, which should complete the request */
81459350 6005 io_poll_execute(req, 0, 0);
aa43477b
PB
6006}
6007
d89a4fac
JA
6008#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
6009#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
6010
aa43477b
PB
6011static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
6012 void *key)
18bceab1 6013{
d89a4fac 6014 struct io_kiocb *req = wqe_to_req(wait);
aa43477b
PB
6015 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
6016 wait);
18bceab1
JA
6017 __poll_t mask = key_to_poll(key);
6018
791f3465
PB
6019 if (unlikely(mask & POLLFREE)) {
6020 io_poll_mark_cancelled(req);
6021 /* we have to kick tw in case it's not already */
81459350 6022 io_poll_execute(req, 0, poll->events);
791f3465
PB
6023
6024 /*
6025 * If the waitqueue is being freed early but someone is already
6026 * holds ownership over it, we have to tear down the request as
6027 * best we can. That means immediately removing the request from
6028 * its waitqueue and preventing all further accesses to the
6029 * waitqueue via the request.
6030 */
6031 list_del_init(&poll->wait.entry);
6032
6033 /*
6034 * Careful: this *must* be the last step, since as soon
6035 * as req->head is NULL'ed out, the request can be
6036 * completed and freed, since aio_poll_complete_work()
6037 * will no longer need to take the waitqueue lock.
6038 */
6039 smp_store_release(&poll->head, NULL);
6040 return 1;
6041 }
6042
aa43477b 6043 /* for instances that support it check for an event match first */
18bceab1
JA
6044 if (mask && !(mask & poll->events))
6045 return 0;
6046
eb0089d6
PB
6047 if (io_poll_get_ownership(req)) {
6048 /* optional, saves extra locking for removal in tw handler */
6049 if (mask && poll->events & EPOLLONESHOT) {
6050 list_del_init(&poll->wait.entry);
6051 poll->head = NULL;
d89a4fac
JA
6052 if (wqe_is_double(wait))
6053 req->flags &= ~REQ_F_DOUBLE_POLL;
6054 else
6055 req->flags &= ~REQ_F_SINGLE_POLL;
eb0089d6 6056 }
81459350 6057 __io_poll_execute(req, mask, poll->events);
eb0089d6 6058 }
18bceab1 6059 return 1;
18bceab1
JA
6060}
6061
6062static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
6063 struct wait_queue_head *head,
6064 struct io_poll_iocb **poll_ptr)
18bceab1
JA
6065{
6066 struct io_kiocb *req = pt->req;
d89a4fac 6067 unsigned long wqe_private = (unsigned long) req;
18bceab1
JA
6068
6069 /*
68b11e8b
PB
6070 * The file being polled uses multiple waitqueues for poll handling
6071 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
6072 * if this happens.
18bceab1 6073 */
68b11e8b 6074 if (unlikely(pt->nr_entries)) {
aa43477b 6075 struct io_poll_iocb *first = poll;
58852d4d 6076
23a65db8 6077 /* double add on the same waitqueue head, ignore */
aa43477b 6078 if (first->head == head)
23a65db8 6079 return;
18bceab1 6080 /* already have a 2nd entry, fail a third attempt */
807abcb0 6081 if (*poll_ptr) {
23a65db8
PB
6082 if ((*poll_ptr)->head == head)
6083 return;
18bceab1
JA
6084 pt->error = -EINVAL;
6085 return;
6086 }
aa43477b 6087
18bceab1
JA
6088 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
6089 if (!poll) {
6090 pt->error = -ENOMEM;
6091 return;
6092 }
d89a4fac
JA
6093 /* mark as double wq entry */
6094 wqe_private |= 1;
91eac1c6 6095 req->flags |= REQ_F_DOUBLE_POLL;
aa43477b 6096 io_init_poll_iocb(poll, first->events, first->wait.func);
807abcb0 6097 *poll_ptr = poll;
d886e185
PB
6098 if (req->opcode == IORING_OP_POLL_ADD)
6099 req->flags |= REQ_F_ASYNC_DATA;
18bceab1
JA
6100 }
6101
91eac1c6 6102 req->flags |= REQ_F_SINGLE_POLL;
68b11e8b 6103 pt->nr_entries++;
18bceab1 6104 poll->head = head;
d89a4fac 6105 poll->wait.private = (void *) wqe_private;
a31eb4a2
JX
6106
6107 if (poll->events & EPOLLEXCLUSIVE)
6108 add_wait_queue_exclusive(head, &poll->wait);
6109 else
6110 add_wait_queue(head, &poll->wait);
18bceab1
JA
6111}
6112
aa43477b 6113static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
18bceab1
JA
6114 struct poll_table_struct *p)
6115{
6116 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
d7718a9d 6117
aa43477b
PB
6118 __io_queue_proc(&pt->req->poll, pt, head,
6119 (struct io_poll_iocb **) &pt->req->async_data);
d7718a9d
JA
6120}
6121
aa43477b
PB
6122static int __io_arm_poll_handler(struct io_kiocb *req,
6123 struct io_poll_iocb *poll,
6124 struct io_poll_table *ipt, __poll_t mask)
d7718a9d
JA
6125{
6126 struct io_ring_ctx *ctx = req->ctx;
aa43477b 6127 int v;
d7718a9d 6128
4d52f338 6129 INIT_HLIST_NODE(&req->hash_node);
aa43477b 6130 io_init_poll_iocb(poll, mask, io_poll_wake);
b90cd197 6131 poll->file = req->file;
d7718a9d
JA
6132
6133 ipt->pt._key = mask;
6134 ipt->req = req;
68b11e8b
PB
6135 ipt->error = 0;
6136 ipt->nr_entries = 0;
d7718a9d 6137
aa43477b
PB
6138 /*
6139 * Take the ownership to delay any tw execution up until we're done
6140 * with poll arming. see io_poll_get_ownership().
6141 */
6142 atomic_set(&req->poll_refs, 1);
d7718a9d 6143 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
aa43477b
PB
6144
6145 if (mask && (poll->events & EPOLLONESHOT)) {
6146 io_poll_remove_entries(req);
6147 /* no one else has access to the req, forget about the ref */
6148 return mask;
6149 }
6150 if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
6151 io_poll_remove_entries(req);
6152 if (!ipt->error)
6153 ipt->error = -EINVAL;
6154 return 0;
6155 }
d7718a9d 6156
79ebeaee 6157 spin_lock(&ctx->completion_lock);
aa43477b
PB
6158 io_poll_req_insert(req);
6159 spin_unlock(&ctx->completion_lock);
6160
6161 if (mask) {
6162 /* can't multishot if failed, just queue the event we've got */
6163 if (unlikely(ipt->error || !ipt->nr_entries))
6164 poll->events |= EPOLLONESHOT;
81459350 6165 __io_poll_execute(req, mask, poll->events);
aa43477b 6166 return 0;
d7718a9d
JA
6167 }
6168
aa43477b
PB
6169 /*
6170 * Release ownership. If someone tried to queue a tw while it was
6171 * locked, kick it off for them.
6172 */
6173 v = atomic_dec_return(&req->poll_refs);
6174 if (unlikely(v & IO_POLL_REF_MASK))
81459350 6175 __io_poll_execute(req, 0, poll->events);
aa43477b
PB
6176 return 0;
6177}
6178
6179static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
6180 struct poll_table_struct *p)
6181{
6182 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6183 struct async_poll *apoll = pt->req->apoll;
6184
6185 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
d7718a9d
JA
6186}
6187
59b735ae
OL
6188enum {
6189 IO_APOLL_OK,
6190 IO_APOLL_ABORTED,
6191 IO_APOLL_READY
6192};
6193
4d9237e3 6194static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
d7718a9d
JA
6195{
6196 const struct io_op_def *def = &io_op_defs[req->opcode];
6197 struct io_ring_ctx *ctx = req->ctx;
6198 struct async_poll *apoll;
6199 struct io_poll_table ipt;
aa43477b
PB
6200 __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
6201 int ret;
d7718a9d 6202
b2d9c3da
PB
6203 if (!def->pollin && !def->pollout)
6204 return IO_APOLL_ABORTED;
658d0a40
PB
6205 if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
6206 return IO_APOLL_ABORTED;
b2d9c3da
PB
6207
6208 if (def->pollin) {
b2d9c3da
PB
6209 mask |= POLLIN | POLLRDNORM;
6210
6211 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
6212 if ((req->opcode == IORING_OP_RECVMSG) &&
6213 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
6214 mask &= ~POLLIN;
6215 } else {
b2d9c3da
PB
6216 mask |= POLLOUT | POLLWRNORM;
6217 }
52dd8640
DY
6218 if (def->poll_exclusive)
6219 mask |= EPOLLEXCLUSIVE;
4d9237e3
JA
6220 if (!(issue_flags & IO_URING_F_UNLOCKED) &&
6221 !list_empty(&ctx->apoll_cache)) {
6222 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
6223 poll.wait.entry);
6224 list_del_init(&apoll->poll.wait.entry);
6225 } else {
6226 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
6227 if (unlikely(!apoll))
6228 return IO_APOLL_ABORTED;
6229 }
807abcb0 6230 apoll->double_poll = NULL;
d7718a9d 6231 req->apoll = apoll;
b2d9c3da 6232 req->flags |= REQ_F_POLLED;
d7718a9d
JA
6233 ipt.pt._qproc = io_async_queue_proc;
6234
4d55f238 6235 io_kbuf_recycle(req, issue_flags);
abdad709 6236
aa43477b 6237 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
41a5169c
HX
6238 if (ret || ipt.error)
6239 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
6240
cef216fc 6241 trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
236daeae 6242 mask, apoll->poll.events);
59b735ae 6243 return IO_APOLL_OK;
d7718a9d
JA
6244}
6245
76e1b642
JA
6246/*
6247 * Returns true if we found and killed one or more poll requests
6248 */
c072481d
PB
6249static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
6250 struct task_struct *tsk, bool cancel_all)
221c5eb2 6251{
78076bb6 6252 struct hlist_node *tmp;
221c5eb2 6253 struct io_kiocb *req;
aa43477b
PB
6254 bool found = false;
6255 int i;
221c5eb2 6256
79ebeaee 6257 spin_lock(&ctx->completion_lock);
78076bb6
JA
6258 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
6259 struct hlist_head *list;
6260
6261 list = &ctx->cancel_hash[i];
f3606e3a 6262 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
42a7b4ed 6263 if (io_match_task_safe(req, tsk, cancel_all)) {
61bc84c4 6264 hlist_del_init(&req->hash_node);
aa43477b
PB
6265 io_poll_cancel_req(req);
6266 found = true;
6267 }
f3606e3a 6268 }
221c5eb2 6269 }
79ebeaee 6270 spin_unlock(&ctx->completion_lock);
aa43477b 6271 return found;
221c5eb2
JA
6272}
6273
9ba5fac8
PB
6274static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
6275 bool poll_only)
e07785b0 6276 __must_hold(&ctx->completion_lock)
47f46768 6277{
78076bb6 6278 struct hlist_head *list;
47f46768
JA
6279 struct io_kiocb *req;
6280
78076bb6
JA
6281 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
6282 hlist_for_each_entry(req, list, hash_node) {
cef216fc 6283 if (sqe_addr != req->cqe.user_data)
b41e9852 6284 continue;
9ba5fac8
PB
6285 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
6286 continue;
b2cb805f 6287 return req;
47f46768 6288 }
b2cb805f
JA
6289 return NULL;
6290}
6291
aa43477b
PB
6292static bool io_poll_disarm(struct io_kiocb *req)
6293 __must_hold(&ctx->completion_lock)
6294{
6295 if (!io_poll_get_ownership(req))
6296 return false;
6297 io_poll_remove_entries(req);
6298 hash_del(&req->hash_node);
6299 return true;
6300}
6301
9ba5fac8
PB
6302static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
6303 bool poll_only)
e07785b0 6304 __must_hold(&ctx->completion_lock)
b2cb805f 6305{
aa43477b 6306 struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
b2cb805f 6307
b2cb805f
JA
6308 if (!req)
6309 return -ENOENT;
aa43477b
PB
6310 io_poll_cancel_req(req);
6311 return 0;
47f46768
JA
6312}
6313
9096af3e
PB
6314static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
6315 unsigned int flags)
6316{
6317 u32 events;
47f46768 6318
9096af3e
PB
6319 events = READ_ONCE(sqe->poll32_events);
6320#ifdef __BIG_ENDIAN
6321 events = swahw32(events);
6322#endif
6323 if (!(flags & IORING_POLL_ADD_MULTI))
6324 events |= EPOLLONESHOT;
6325 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
47f46768
JA
6326}
6327
c5de0036 6328static int io_poll_update_prep(struct io_kiocb *req,
3529d8c2 6329 const struct io_uring_sqe *sqe)
0969e783 6330{
c5de0036
PB
6331 struct io_poll_update *upd = &req->poll_update;
6332 u32 flags;
6333
0969e783
JA
6334 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6335 return -EINVAL;
26578cda 6336 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
c5de0036
PB
6337 return -EINVAL;
6338 flags = READ_ONCE(sqe->len);
6339 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
6340 IORING_POLL_ADD_MULTI))
6341 return -EINVAL;
6342 /* meaningless without update */
6343 if (flags == IORING_POLL_ADD_MULTI)
0969e783
JA
6344 return -EINVAL;
6345
c5de0036
PB
6346 upd->old_user_data = READ_ONCE(sqe->addr);
6347 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
6348 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
221c5eb2 6349
c5de0036
PB
6350 upd->new_user_data = READ_ONCE(sqe->off);
6351 if (!upd->update_user_data && upd->new_user_data)
6352 return -EINVAL;
6353 if (upd->update_events)
6354 upd->events = io_poll_parse_events(sqe, flags);
6355 else if (sqe->poll32_events)
6356 return -EINVAL;
221c5eb2 6357
221c5eb2
JA
6358 return 0;
6359}
6360
3529d8c2 6361static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
6362{
6363 struct io_poll_iocb *poll = &req->poll;
c5de0036 6364 u32 flags;
221c5eb2
JA
6365
6366 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6367 return -EINVAL;
c5de0036 6368 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
88e41cf9
JA
6369 return -EINVAL;
6370 flags = READ_ONCE(sqe->len);
c5de0036 6371 if (flags & ~IORING_POLL_ADD_MULTI)
221c5eb2 6372 return -EINVAL;
04c76b41
PB
6373 if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
6374 return -EINVAL;
221c5eb2 6375
48dcd38d 6376 io_req_set_refcount(req);
2804ecd8 6377 req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
0969e783
JA
6378 return 0;
6379}
6380
61e98203 6381static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
6382{
6383 struct io_poll_iocb *poll = &req->poll;
0969e783 6384 struct io_poll_table ipt;
aa43477b 6385 int ret;
0969e783 6386
d7718a9d 6387 ipt.pt._qproc = io_poll_queue_proc;
36703247 6388
aa43477b
PB
6389 ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
6390 ret = ret ?: ipt.error;
6391 if (ret)
6392 __io_req_complete(req, issue_flags, ret, 0);
6393 return 0;
221c5eb2
JA
6394}
6395
c5de0036 6396static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
b69de288
JA
6397{
6398 struct io_ring_ctx *ctx = req->ctx;
6399 struct io_kiocb *preq;
2bbb146d 6400 int ret2, ret = 0;
cc8e9ba7 6401 bool locked;
b69de288 6402
79ebeaee 6403 spin_lock(&ctx->completion_lock);
9ba5fac8 6404 preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
aa43477b 6405 if (!preq || !io_poll_disarm(preq)) {
79ebeaee 6406 spin_unlock(&ctx->completion_lock);
aa43477b 6407 ret = preq ? -EALREADY : -ENOENT;
2bbb146d 6408 goto out;
b69de288 6409 }
79ebeaee 6410 spin_unlock(&ctx->completion_lock);
cb3b200e 6411
2bbb146d
PB
6412 if (req->poll_update.update_events || req->poll_update.update_user_data) {
6413 /* only mask one event flags, keep behavior flags */
6414 if (req->poll_update.update_events) {
6415 preq->poll.events &= ~0xffff;
6416 preq->poll.events |= req->poll_update.events & 0xffff;
6417 preq->poll.events |= IO_POLL_UNMASK;
cb3b200e 6418 }
2bbb146d 6419 if (req->poll_update.update_user_data)
cef216fc 6420 preq->cqe.user_data = req->poll_update.new_user_data;
b69de288 6421
2bbb146d
PB
6422 ret2 = io_poll_add(preq, issue_flags);
6423 /* successfully updated, don't complete poll request */
6424 if (!ret2)
6425 goto out;
b69de288 6426 }
6224590d 6427
2bbb146d 6428 req_set_fail(preq);
cef216fc 6429 preq->cqe.res = -ECANCELED;
cc8e9ba7
PB
6430 locked = !(issue_flags & IO_URING_F_UNLOCKED);
6431 io_req_task_complete(preq, &locked);
2bbb146d
PB
6432out:
6433 if (ret < 0)
6224590d 6434 req_set_fail(req);
2bbb146d 6435 /* complete update request, we're done with it */
cc8e9ba7 6436 __io_req_complete(req, issue_flags, ret, 0);
b69de288 6437 return 0;
89850fce
JA
6438}
6439
5262f567
JA
6440static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
6441{
ad8a48ac
JA
6442 struct io_timeout_data *data = container_of(timer,
6443 struct io_timeout_data, timer);
6444 struct io_kiocb *req = data->req;
6445 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
6446 unsigned long flags;
6447
89850fce 6448 spin_lock_irqsave(&ctx->timeout_lock, flags);
a71976f3 6449 list_del_init(&req->timeout.list);
01cec8c1
PB
6450 atomic_set(&req->ctx->cq_timeouts,
6451 atomic_read(&req->ctx->cq_timeouts) + 1);
89850fce 6452 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
01cec8c1 6453
a90c8bf6
PB
6454 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
6455 req_set_fail(req);
6456
cef216fc 6457 req->cqe.res = -ETIME;
a90c8bf6 6458 req->io_task_work.func = io_req_task_complete;
4813c377 6459 io_req_task_work_add(req, false);
5262f567
JA
6460 return HRTIMER_NORESTART;
6461}
6462
fbd15848
PB
6463static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
6464 __u64 user_data)
89850fce 6465 __must_hold(&ctx->timeout_lock)
f254ac04 6466{
fbd15848 6467 struct io_timeout_data *io;
47f46768 6468 struct io_kiocb *req;
fd9c7bc5 6469 bool found = false;
f254ac04 6470
135fcde8 6471 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
cef216fc 6472 found = user_data == req->cqe.user_data;
fd9c7bc5 6473 if (found)
47f46768 6474 break;
47f46768 6475 }
fd9c7bc5
PB
6476 if (!found)
6477 return ERR_PTR(-ENOENT);
fbd15848
PB
6478
6479 io = req->async_data;
fd9c7bc5 6480 if (hrtimer_try_to_cancel(&io->timer) == -1)
fbd15848 6481 return ERR_PTR(-EALREADY);
a71976f3 6482 list_del_init(&req->timeout.list);
fbd15848
PB
6483 return req;
6484}
47f46768 6485
fbd15848 6486static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
ec3c3d0f 6487 __must_hold(&ctx->completion_lock)
89850fce 6488 __must_hold(&ctx->timeout_lock)
fbd15848
PB
6489{
6490 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6491
6492 if (IS_ERR(req))
6493 return PTR_ERR(req);
6695490d 6494 io_req_task_queue_fail(req, -ECANCELED);
f254ac04
JA
6495 return 0;
6496}
6497
50c1df2b
JA
6498static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
6499{
6500 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
6501 case IORING_TIMEOUT_BOOTTIME:
6502 return CLOCK_BOOTTIME;
6503 case IORING_TIMEOUT_REALTIME:
6504 return CLOCK_REALTIME;
6505 default:
6506 /* can't happen, vetted at prep time */
6507 WARN_ON_ONCE(1);
6508 fallthrough;
6509 case 0:
6510 return CLOCK_MONOTONIC;
6511 }
6512}
6513
f1042b6c
PB
6514static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6515 struct timespec64 *ts, enum hrtimer_mode mode)
6516 __must_hold(&ctx->timeout_lock)
6517{
6518 struct io_timeout_data *io;
6519 struct io_kiocb *req;
6520 bool found = false;
6521
6522 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
cef216fc 6523 found = user_data == req->cqe.user_data;
f1042b6c
PB
6524 if (found)
6525 break;
6526 }
6527 if (!found)
6528 return -ENOENT;
6529
6530 io = req->async_data;
6531 if (hrtimer_try_to_cancel(&io->timer) == -1)
6532 return -EALREADY;
6533 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6534 io->timer.function = io_link_timeout_fn;
6535 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6536 return 0;
6537}
6538
9c8e11b3
PB
6539static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6540 struct timespec64 *ts, enum hrtimer_mode mode)
89850fce 6541 __must_hold(&ctx->timeout_lock)
47f46768 6542{
9c8e11b3
PB
6543 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6544 struct io_timeout_data *data;
47f46768 6545
9c8e11b3
PB
6546 if (IS_ERR(req))
6547 return PTR_ERR(req);
47f46768 6548
9c8e11b3
PB
6549 req->timeout.off = 0; /* noseq */
6550 data = req->async_data;
6551 list_add_tail(&req->timeout.list, &ctx->timeout_list);
50c1df2b 6552 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
9c8e11b3
PB
6553 data->timer.function = io_timeout_fn;
6554 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6555 return 0;
47f46768
JA
6556}
6557
3529d8c2
JA
6558static int io_timeout_remove_prep(struct io_kiocb *req,
6559 const struct io_uring_sqe *sqe)
b29472ee 6560{
9c8e11b3
PB
6561 struct io_timeout_rem *tr = &req->timeout_rem;
6562
b29472ee
JA
6563 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6564 return -EINVAL;
61710e43
DA
6565 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6566 return -EINVAL;
26578cda 6567 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
b29472ee
JA
6568 return -EINVAL;
6569
f1042b6c 6570 tr->ltimeout = false;
9c8e11b3
PB
6571 tr->addr = READ_ONCE(sqe->addr);
6572 tr->flags = READ_ONCE(sqe->timeout_flags);
f1042b6c
PB
6573 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6574 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6575 return -EINVAL;
6576 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6577 tr->ltimeout = true;
6578 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
9c8e11b3
PB
6579 return -EINVAL;
6580 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6581 return -EFAULT;
2087009c
YB
6582 if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
6583 return -EINVAL;
9c8e11b3
PB
6584 } else if (tr->flags) {
6585 /* timeout removal doesn't support flags */
b29472ee 6586 return -EINVAL;
9c8e11b3 6587 }
b29472ee 6588
b29472ee
JA
6589 return 0;
6590}
6591
8662daec
PB
6592static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6593{
6594 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6595 : HRTIMER_MODE_REL;
6596}
6597
11365043
JA
6598/*
6599 * Remove or update an existing timeout command
6600 */
61e98203 6601static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 6602{
9c8e11b3 6603 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 6604 struct io_ring_ctx *ctx = req->ctx;
47f46768 6605 int ret;
11365043 6606
ec3c3d0f
PB
6607 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6608 spin_lock(&ctx->completion_lock);
6609 spin_lock_irq(&ctx->timeout_lock);
9c8e11b3 6610 ret = io_timeout_cancel(ctx, tr->addr);
ec3c3d0f
PB
6611 spin_unlock_irq(&ctx->timeout_lock);
6612 spin_unlock(&ctx->completion_lock);
6613 } else {
f1042b6c
PB
6614 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6615
ec3c3d0f 6616 spin_lock_irq(&ctx->timeout_lock);
f1042b6c
PB
6617 if (tr->ltimeout)
6618 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6619 else
6620 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
ec3c3d0f
PB
6621 spin_unlock_irq(&ctx->timeout_lock);
6622 }
11365043 6623
4e88d6e7 6624 if (ret < 0)
93d2bcd2 6625 req_set_fail(req);
505657bc 6626 io_req_complete_post(req, ret, 0);
11365043 6627 return 0;
5262f567
JA
6628}
6629
3529d8c2 6630static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 6631 bool is_timeout_link)
5262f567 6632{
ad8a48ac 6633 struct io_timeout_data *data;
a41525ab 6634 unsigned flags;
56080b02 6635 u32 off = READ_ONCE(sqe->off);
5262f567 6636
ad8a48ac 6637 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 6638 return -EINVAL;
26578cda
PB
6639 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6640 sqe->splice_fd_in)
a41525ab 6641 return -EINVAL;
56080b02 6642 if (off && is_timeout_link)
2d28390a 6643 return -EINVAL;
a41525ab 6644 flags = READ_ONCE(sqe->timeout_flags);
6224590d
PB
6645 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6646 IORING_TIMEOUT_ETIME_SUCCESS))
50c1df2b
JA
6647 return -EINVAL;
6648 /* more than one clock specified is invalid, obviously */
6649 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5262f567 6650 return -EINVAL;
bdf20073 6651
ef9dd637 6652 INIT_LIST_HEAD(&req->timeout.list);
bfe68a22 6653 req->timeout.off = off;
f18ee4cf
PB
6654 if (unlikely(off && !req->ctx->off_timeout_used))
6655 req->ctx->off_timeout_used = true;
26a61679 6656
d6a644a7
PB
6657 if (WARN_ON_ONCE(req_has_async_data(req)))
6658 return -EFAULT;
6659 if (io_alloc_async_data(req))
26a61679
JA
6660 return -ENOMEM;
6661
e8c2bc1f 6662 data = req->async_data;
ad8a48ac 6663 data->req = req;
50c1df2b 6664 data->flags = flags;
ad8a48ac
JA
6665
6666 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
6667 return -EFAULT;
6668
f6223ff7
YB
6669 if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
6670 return -EINVAL;
6671
e677edbc 6672 INIT_LIST_HEAD(&req->timeout.list);
8662daec 6673 data->mode = io_translate_timeout_mode(flags);
50c1df2b 6674 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
b97e736a
PB
6675
6676 if (is_timeout_link) {
6677 struct io_submit_link *link = &req->ctx->submit_state.link;
6678
6679 if (!link->head)
6680 return -EINVAL;
6681 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6682 return -EINVAL;
4d13d1a4
PB
6683 req->timeout.head = link->last;
6684 link->last->flags |= REQ_F_ARM_LTIMEOUT;
b97e736a 6685 }
ad8a48ac
JA
6686 return 0;
6687}
6688
61e98203 6689static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 6690{
ad8a48ac 6691 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 6692 struct io_timeout_data *data = req->async_data;
ad8a48ac 6693 struct list_head *entry;
bfe68a22 6694 u32 tail, off = req->timeout.off;
ad8a48ac 6695
89850fce 6696 spin_lock_irq(&ctx->timeout_lock);
93bd25bb 6697
5262f567
JA
6698 /*
6699 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
6700 * timeout event to be satisfied. If it isn't set, then this is
6701 * a pure timeout request, sequence isn't used.
5262f567 6702 */
8eb7e2d0 6703 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
6704 entry = ctx->timeout_list.prev;
6705 goto add;
6706 }
5262f567 6707
bfe68a22
PB
6708 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6709 req->timeout.target_seq = tail + off;
5262f567 6710
f010505b
MDG
6711 /* Update the last seq here in case io_flush_timeouts() hasn't.
6712 * This is safe because ->completion_lock is held, and submissions
6713 * and completions are never mixed in the same ->completion_lock section.
6714 */
6715 ctx->cq_last_tm_flush = tail;
6716
5262f567
JA
6717 /*
6718 * Insertion sort, ensuring the first entry in the list is always
6719 * the one we need first.
6720 */
5262f567 6721 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
6722 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6723 timeout.list);
5262f567 6724
8eb7e2d0 6725 if (io_is_timeout_noseq(nxt))
93bd25bb 6726 continue;
bfe68a22
PB
6727 /* nxt.seq is behind @tail, otherwise would've been completed */
6728 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
6729 break;
6730 }
93bd25bb 6731add:
135fcde8 6732 list_add(&req->timeout.list, entry);
ad8a48ac
JA
6733 data->timer.function = io_timeout_fn;
6734 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
89850fce 6735 spin_unlock_irq(&ctx->timeout_lock);
5262f567
JA
6736 return 0;
6737}
5262f567 6738
f458dd84
PB
6739struct io_cancel_data {
6740 struct io_ring_ctx *ctx;
6741 u64 user_data;
6742};
6743
62755e35
JA
6744static bool io_cancel_cb(struct io_wq_work *work, void *data)
6745{
6746 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 6747 struct io_cancel_data *cd = data;
62755e35 6748
cef216fc 6749 return req->ctx == cd->ctx && req->cqe.user_data == cd->user_data;
62755e35
JA
6750}
6751
f458dd84
PB
6752static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6753 struct io_ring_ctx *ctx)
62755e35 6754{
f458dd84 6755 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
62755e35 6756 enum io_wq_cancel cancel_ret;
62755e35
JA
6757 int ret = 0;
6758
f458dd84 6759 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
6760 return -ENOENT;
6761
f458dd84 6762 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
62755e35
JA
6763 switch (cancel_ret) {
6764 case IO_WQ_CANCEL_OK:
6765 ret = 0;
6766 break;
6767 case IO_WQ_CANCEL_RUNNING:
6768 ret = -EALREADY;
6769 break;
6770 case IO_WQ_CANCEL_NOTFOUND:
6771 ret = -ENOENT;
6772 break;
6773 }
6774
e977d6d3
JA
6775 return ret;
6776}
6777
8cb01fac 6778static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
47f46768 6779{
8cb01fac 6780 struct io_ring_ctx *ctx = req->ctx;
47f46768
JA
6781 int ret;
6782
dadebc35 6783 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
8cb01fac 6784
f458dd84 6785 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
ccbf7261
JA
6786 /*
6787 * Fall-through even for -EALREADY, as we may have poll armed
6788 * that need unarming.
6789 */
6790 if (!ret)
6791 return 0;
505657bc
PB
6792
6793 spin_lock(&ctx->completion_lock);
ccbf7261
JA
6794 ret = io_poll_cancel(ctx, sqe_addr, false);
6795 if (ret != -ENOENT)
6796 goto out;
6797
79ebeaee 6798 spin_lock_irq(&ctx->timeout_lock);
47f46768 6799 ret = io_timeout_cancel(ctx, sqe_addr);
79ebeaee 6800 spin_unlock_irq(&ctx->timeout_lock);
505657bc
PB
6801out:
6802 spin_unlock(&ctx->completion_lock);
6803 return ret;
47f46768
JA
6804}
6805
3529d8c2
JA
6806static int io_async_cancel_prep(struct io_kiocb *req,
6807 const struct io_uring_sqe *sqe)
e977d6d3 6808{
fbf23849 6809 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 6810 return -EINVAL;
61710e43
DA
6811 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6812 return -EINVAL;
26578cda
PB
6813 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6814 sqe->splice_fd_in)
e977d6d3
JA
6815 return -EINVAL;
6816
fbf23849
JA
6817 req->cancel.addr = READ_ONCE(sqe->addr);
6818 return 0;
6819}
6820
61e98203 6821static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6822{
6823 struct io_ring_ctx *ctx = req->ctx;
58f99373
PB
6824 u64 sqe_addr = req->cancel.addr;
6825 struct io_tctx_node *node;
6826 int ret;
6827
8cb01fac 6828 ret = io_try_cancel_userdata(req, sqe_addr);
58f99373
PB
6829 if (ret != -ENOENT)
6830 goto done;
58f99373
PB
6831
6832 /* slow path, try all io-wq's */
f8929630 6833 io_ring_submit_lock(ctx, issue_flags);
58f99373
PB
6834 ret = -ENOENT;
6835 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6836 struct io_uring_task *tctx = node->task->io_uring;
fbf23849 6837
58f99373
PB
6838 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6839 if (ret != -ENOENT)
6840 break;
6841 }
f8929630 6842 io_ring_submit_unlock(ctx, issue_flags);
58f99373 6843done:
58f99373 6844 if (ret < 0)
93d2bcd2 6845 req_set_fail(req);
505657bc 6846 io_req_complete_post(req, ret, 0);
5262f567
JA
6847 return 0;
6848}
6849
269bbe5f 6850static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
6851 const struct io_uring_sqe *sqe)
6852{
61710e43
DA
6853 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6854 return -EINVAL;
26578cda 6855 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
05f3fb3c
JA
6856 return -EINVAL;
6857
269bbe5f
BM
6858 req->rsrc_update.offset = READ_ONCE(sqe->off);
6859 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6860 if (!req->rsrc_update.nr_args)
05f3fb3c 6861 return -EINVAL;
269bbe5f 6862 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
6863 return 0;
6864}
6865
889fca73 6866static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6867{
6868 struct io_ring_ctx *ctx = req->ctx;
c3bdad02 6869 struct io_uring_rsrc_update2 up;
05f3fb3c 6870 int ret;
fbf23849 6871
269bbe5f
BM
6872 up.offset = req->rsrc_update.offset;
6873 up.data = req->rsrc_update.arg;
c3bdad02
PB
6874 up.nr = 0;
6875 up.tags = 0;
615cee49 6876 up.resv = 0;
d8a3ba9c 6877 up.resv2 = 0;
05f3fb3c 6878
f8929630 6879 io_ring_submit_lock(ctx, issue_flags);
fdecb662 6880 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
98f0b3b4 6881 &up, req->rsrc_update.nr_args);
f8929630 6882 io_ring_submit_unlock(ctx, issue_flags);
05f3fb3c
JA
6883
6884 if (ret < 0)
93d2bcd2 6885 req_set_fail(req);
889fca73 6886 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
6887 return 0;
6888}
6889
bfe76559 6890static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 6891{
d625c6ee 6892 switch (req->opcode) {
e781573e 6893 case IORING_OP_NOP:
bfe76559 6894 return 0;
f67676d1
JA
6895 case IORING_OP_READV:
6896 case IORING_OP_READ_FIXED:
3a6820f2 6897 case IORING_OP_READ:
f67676d1
JA
6898 case IORING_OP_WRITEV:
6899 case IORING_OP_WRITE_FIXED:
3a6820f2 6900 case IORING_OP_WRITE:
584b0180 6901 return io_prep_rw(req, sqe);
0969e783 6902 case IORING_OP_POLL_ADD:
bfe76559 6903 return io_poll_add_prep(req, sqe);
0969e783 6904 case IORING_OP_POLL_REMOVE:
c5de0036 6905 return io_poll_update_prep(req, sqe);
8ed8d3c3 6906 case IORING_OP_FSYNC:
1155c76a 6907 return io_fsync_prep(req, sqe);
8ed8d3c3 6908 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 6909 return io_sfr_prep(req, sqe);
03b1230c 6910 case IORING_OP_SENDMSG:
fddaface 6911 case IORING_OP_SEND:
bfe76559 6912 return io_sendmsg_prep(req, sqe);
03b1230c 6913 case IORING_OP_RECVMSG:
fddaface 6914 case IORING_OP_RECV:
bfe76559 6915 return io_recvmsg_prep(req, sqe);
f499a021 6916 case IORING_OP_CONNECT:
bfe76559 6917 return io_connect_prep(req, sqe);
2d28390a 6918 case IORING_OP_TIMEOUT:
bfe76559 6919 return io_timeout_prep(req, sqe, false);
b29472ee 6920 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 6921 return io_timeout_remove_prep(req, sqe);
fbf23849 6922 case IORING_OP_ASYNC_CANCEL:
bfe76559 6923 return io_async_cancel_prep(req, sqe);
2d28390a 6924 case IORING_OP_LINK_TIMEOUT:
bfe76559 6925 return io_timeout_prep(req, sqe, true);
8ed8d3c3 6926 case IORING_OP_ACCEPT:
bfe76559 6927 return io_accept_prep(req, sqe);
d63d1b5e 6928 case IORING_OP_FALLOCATE:
bfe76559 6929 return io_fallocate_prep(req, sqe);
15b71abe 6930 case IORING_OP_OPENAT:
bfe76559 6931 return io_openat_prep(req, sqe);
b5dba59e 6932 case IORING_OP_CLOSE:
bfe76559 6933 return io_close_prep(req, sqe);
05f3fb3c 6934 case IORING_OP_FILES_UPDATE:
269bbe5f 6935 return io_rsrc_update_prep(req, sqe);
eddc7ef5 6936 case IORING_OP_STATX:
bfe76559 6937 return io_statx_prep(req, sqe);
4840e418 6938 case IORING_OP_FADVISE:
bfe76559 6939 return io_fadvise_prep(req, sqe);
c1ca757b 6940 case IORING_OP_MADVISE:
bfe76559 6941 return io_madvise_prep(req, sqe);
cebdb986 6942 case IORING_OP_OPENAT2:
bfe76559 6943 return io_openat2_prep(req, sqe);
3e4827b0 6944 case IORING_OP_EPOLL_CTL:
bfe76559 6945 return io_epoll_ctl_prep(req, sqe);
7d67af2c 6946 case IORING_OP_SPLICE:
bfe76559 6947 return io_splice_prep(req, sqe);
ddf0322d 6948 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 6949 return io_provide_buffers_prep(req, sqe);
067524e9 6950 case IORING_OP_REMOVE_BUFFERS:
bfe76559 6951 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 6952 case IORING_OP_TEE:
bfe76559 6953 return io_tee_prep(req, sqe);
36f4fa68
JA
6954 case IORING_OP_SHUTDOWN:
6955 return io_shutdown_prep(req, sqe);
80a261fd
JA
6956 case IORING_OP_RENAMEAT:
6957 return io_renameat_prep(req, sqe);
14a1143b
JA
6958 case IORING_OP_UNLINKAT:
6959 return io_unlinkat_prep(req, sqe);
e34a02dc
DK
6960 case IORING_OP_MKDIRAT:
6961 return io_mkdirat_prep(req, sqe);
7a8721f8
DK
6962 case IORING_OP_SYMLINKAT:
6963 return io_symlinkat_prep(req, sqe);
cf30da90
DK
6964 case IORING_OP_LINKAT:
6965 return io_linkat_prep(req, sqe);
4f57f06c
JA
6966 case IORING_OP_MSG_RING:
6967 return io_msg_ring_prep(req, sqe);
f67676d1
JA
6968 }
6969
bfe76559
PB
6970 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6971 req->opcode);
bd54b6fe 6972 return -EINVAL;
bfe76559
PB
6973}
6974
93642ef8 6975static int io_req_prep_async(struct io_kiocb *req)
bfe76559 6976{
b7e298d2
PB
6977 if (!io_op_defs[req->opcode].needs_async_setup)
6978 return 0;
d886e185 6979 if (WARN_ON_ONCE(req_has_async_data(req)))
b7e298d2
PB
6980 return -EFAULT;
6981 if (io_alloc_async_data(req))
6982 return -EAGAIN;
6983
93642ef8
PB
6984 switch (req->opcode) {
6985 case IORING_OP_READV:
93642ef8
PB
6986 return io_rw_prep_async(req, READ);
6987 case IORING_OP_WRITEV:
93642ef8
PB
6988 return io_rw_prep_async(req, WRITE);
6989 case IORING_OP_SENDMSG:
93642ef8
PB
6990 return io_sendmsg_prep_async(req);
6991 case IORING_OP_RECVMSG:
93642ef8
PB
6992 return io_recvmsg_prep_async(req);
6993 case IORING_OP_CONNECT:
6994 return io_connect_prep_async(req);
6995 }
b7e298d2
PB
6996 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6997 req->opcode);
6998 return -EFAULT;
f67676d1
JA
6999}
7000
9cf7c104
PB
7001static u32 io_get_sequence(struct io_kiocb *req)
7002{
a3dbdf54 7003 u32 seq = req->ctx->cached_sq_head;
963c6abb 7004 struct io_kiocb *cur;
9cf7c104 7005
a3dbdf54 7006 /* need original cached_sq_head, but it was increased for each req */
963c6abb 7007 io_for_each_link(cur, req)
a3dbdf54
PB
7008 seq--;
7009 return seq;
9cf7c104
PB
7010}
7011
c072481d 7012static __cold void io_drain_req(struct io_kiocb *req)
de0617e4 7013{
a197f664 7014 struct io_ring_ctx *ctx = req->ctx;
27dc8338 7015 struct io_defer_entry *de;
f67676d1 7016 int ret;
e0eb71dc 7017 u32 seq = io_get_sequence(req);
3c19966d 7018
9d858b21 7019 /* Still need defer if there is pending req in defer list. */
e302f104 7020 spin_lock(&ctx->completion_lock);
5e371265 7021 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
e302f104 7022 spin_unlock(&ctx->completion_lock);
e0eb71dc 7023queue:
10c66904 7024 ctx->drain_active = false;
e0eb71dc
PB
7025 io_req_task_queue(req);
7026 return;
10c66904 7027 }
e302f104 7028 spin_unlock(&ctx->completion_lock);
9cf7c104 7029
b7e298d2 7030 ret = io_req_prep_async(req);
e0eb71dc
PB
7031 if (ret) {
7032fail:
7033 io_req_complete_failed(req, ret);
7034 return;
7035 }
cbdcb435 7036 io_prep_async_link(req);
27dc8338 7037 de = kmalloc(sizeof(*de), GFP_KERNEL);
76cc33d7 7038 if (!de) {
1b48773f 7039 ret = -ENOMEM;
e0eb71dc 7040 goto fail;
76cc33d7 7041 }
2d28390a 7042
79ebeaee 7043 spin_lock(&ctx->completion_lock);
9cf7c104 7044 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
79ebeaee 7045 spin_unlock(&ctx->completion_lock);
27dc8338 7046 kfree(de);
e0eb71dc 7047 goto queue;
de0617e4
JA
7048 }
7049
cef216fc 7050 trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
27dc8338 7051 de->req = req;
9cf7c104 7052 de->seq = seq;
27dc8338 7053 list_add_tail(&de->list, &ctx->defer_list);
79ebeaee 7054 spin_unlock(&ctx->completion_lock);
de0617e4
JA
7055}
7056
68fb8979 7057static void io_clean_op(struct io_kiocb *req)
99bc4c38 7058{
8197b053
PB
7059 if (req->flags & REQ_F_BUFFER_SELECTED) {
7060 spin_lock(&req->ctx->completion_lock);
cc3cec83 7061 io_put_kbuf_comp(req);
8197b053
PB
7062 spin_unlock(&req->ctx->completion_lock);
7063 }
99bc4c38 7064
0e1b6fe3
PB
7065 if (req->flags & REQ_F_NEED_CLEANUP) {
7066 switch (req->opcode) {
7067 case IORING_OP_READV:
7068 case IORING_OP_READ_FIXED:
7069 case IORING_OP_READ:
7070 case IORING_OP_WRITEV:
7071 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
7072 case IORING_OP_WRITE: {
7073 struct io_async_rw *io = req->async_data;
1dacb4df
PB
7074
7075 kfree(io->free_iovec);
0e1b6fe3 7076 break;
e8c2bc1f 7077 }
0e1b6fe3 7078 case IORING_OP_RECVMSG:
e8c2bc1f
JA
7079 case IORING_OP_SENDMSG: {
7080 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
7081
7082 kfree(io->free_iov);
0e1b6fe3 7083 break;
e8c2bc1f 7084 }
f3cd4850
JA
7085 case IORING_OP_OPENAT:
7086 case IORING_OP_OPENAT2:
7087 if (req->open.filename)
7088 putname(req->open.filename);
7089 break;
80a261fd
JA
7090 case IORING_OP_RENAMEAT:
7091 putname(req->rename.oldpath);
7092 putname(req->rename.newpath);
7093 break;
14a1143b
JA
7094 case IORING_OP_UNLINKAT:
7095 putname(req->unlink.filename);
7096 break;
e34a02dc
DK
7097 case IORING_OP_MKDIRAT:
7098 putname(req->mkdir.filename);
7099 break;
7a8721f8
DK
7100 case IORING_OP_SYMLINKAT:
7101 putname(req->symlink.oldpath);
7102 putname(req->symlink.newpath);
7103 break;
cf30da90
DK
7104 case IORING_OP_LINKAT:
7105 putname(req->hardlink.oldpath);
7106 putname(req->hardlink.newpath);
7107 break;
1b6fe6e0
SR
7108 case IORING_OP_STATX:
7109 if (req->statx.filename)
7110 putname(req->statx.filename);
7111 break;
0e1b6fe3 7112 }
99bc4c38 7113 }
75652a30
JA
7114 if ((req->flags & REQ_F_POLLED) && req->apoll) {
7115 kfree(req->apoll->double_poll);
7116 kfree(req->apoll);
7117 req->apoll = NULL;
7118 }
c854357b 7119 if (req->flags & REQ_F_CREDS)
b8e64b53 7120 put_cred(req->creds);
d886e185
PB
7121 if (req->flags & REQ_F_ASYNC_DATA) {
7122 kfree(req->async_data);
7123 req->async_data = NULL;
7124 }
c854357b 7125 req->flags &= ~IO_REQ_CLEAN_FLAGS;
99bc4c38
PB
7126}
7127
6bf9c47a
JA
7128static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
7129{
7130 if (req->file || !io_op_defs[req->opcode].needs_file)
7131 return true;
7132
7133 if (req->flags & REQ_F_FIXED_FILE)
cef216fc 7134 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
6bf9c47a 7135 else
cef216fc 7136 req->file = io_file_get_normal(req, req->cqe.fd);
6bf9c47a
JA
7137 if (req->file)
7138 return true;
7139
7140 req_set_fail(req);
cef216fc 7141 req->cqe.res = -EBADF;
6bf9c47a
JA
7142 return false;
7143}
7144
889fca73 7145static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 7146{
5730b27e 7147 const struct cred *creds = NULL;
d625c6ee 7148 int ret;
2b188cc1 7149
70152140
JA
7150 if (unlikely(!io_assign_file(req, issue_flags)))
7151 return -EBADF;
7152
6878b40e 7153 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
c10d1f98 7154 creds = override_creds(req->creds);
5730b27e 7155
5bd2182d
PM
7156 if (!io_op_defs[req->opcode].audit_skip)
7157 audit_uring_entry(req->opcode);
7158
d625c6ee 7159 switch (req->opcode) {
2b188cc1 7160 case IORING_OP_NOP:
889fca73 7161 ret = io_nop(req, issue_flags);
2b188cc1
JA
7162 break;
7163 case IORING_OP_READV:
edafccee 7164 case IORING_OP_READ_FIXED:
3a6820f2 7165 case IORING_OP_READ:
889fca73 7166 ret = io_read(req, issue_flags);
edafccee 7167 break;
3529d8c2 7168 case IORING_OP_WRITEV:
edafccee 7169 case IORING_OP_WRITE_FIXED:
3a6820f2 7170 case IORING_OP_WRITE:
889fca73 7171 ret = io_write(req, issue_flags);
2b188cc1 7172 break;
c992fe29 7173 case IORING_OP_FSYNC:
45d189c6 7174 ret = io_fsync(req, issue_flags);
c992fe29 7175 break;
221c5eb2 7176 case IORING_OP_POLL_ADD:
61e98203 7177 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
7178 break;
7179 case IORING_OP_POLL_REMOVE:
c5de0036 7180 ret = io_poll_update(req, issue_flags);
221c5eb2 7181 break;
5d17b4a4 7182 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 7183 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 7184 break;
0fa03c62 7185 case IORING_OP_SENDMSG:
889fca73 7186 ret = io_sendmsg(req, issue_flags);
062d04d7 7187 break;
fddaface 7188 case IORING_OP_SEND:
889fca73 7189 ret = io_send(req, issue_flags);
0fa03c62 7190 break;
aa1fa28f 7191 case IORING_OP_RECVMSG:
889fca73 7192 ret = io_recvmsg(req, issue_flags);
062d04d7 7193 break;
fddaface 7194 case IORING_OP_RECV:
889fca73 7195 ret = io_recv(req, issue_flags);
aa1fa28f 7196 break;
5262f567 7197 case IORING_OP_TIMEOUT:
61e98203 7198 ret = io_timeout(req, issue_flags);
5262f567 7199 break;
11365043 7200 case IORING_OP_TIMEOUT_REMOVE:
61e98203 7201 ret = io_timeout_remove(req, issue_flags);
11365043 7202 break;
17f2fe35 7203 case IORING_OP_ACCEPT:
889fca73 7204 ret = io_accept(req, issue_flags);
17f2fe35 7205 break;
f8e85cf2 7206 case IORING_OP_CONNECT:
889fca73 7207 ret = io_connect(req, issue_flags);
f8e85cf2 7208 break;
62755e35 7209 case IORING_OP_ASYNC_CANCEL:
61e98203 7210 ret = io_async_cancel(req, issue_flags);
62755e35 7211 break;
d63d1b5e 7212 case IORING_OP_FALLOCATE:
45d189c6 7213 ret = io_fallocate(req, issue_flags);
d63d1b5e 7214 break;
15b71abe 7215 case IORING_OP_OPENAT:
45d189c6 7216 ret = io_openat(req, issue_flags);
15b71abe 7217 break;
b5dba59e 7218 case IORING_OP_CLOSE:
889fca73 7219 ret = io_close(req, issue_flags);
b5dba59e 7220 break;
05f3fb3c 7221 case IORING_OP_FILES_UPDATE:
889fca73 7222 ret = io_files_update(req, issue_flags);
05f3fb3c 7223 break;
eddc7ef5 7224 case IORING_OP_STATX:
45d189c6 7225 ret = io_statx(req, issue_flags);
eddc7ef5 7226 break;
4840e418 7227 case IORING_OP_FADVISE:
45d189c6 7228 ret = io_fadvise(req, issue_flags);
4840e418 7229 break;
c1ca757b 7230 case IORING_OP_MADVISE:
45d189c6 7231 ret = io_madvise(req, issue_flags);
c1ca757b 7232 break;
cebdb986 7233 case IORING_OP_OPENAT2:
45d189c6 7234 ret = io_openat2(req, issue_flags);
cebdb986 7235 break;
3e4827b0 7236 case IORING_OP_EPOLL_CTL:
889fca73 7237 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 7238 break;
7d67af2c 7239 case IORING_OP_SPLICE:
45d189c6 7240 ret = io_splice(req, issue_flags);
7d67af2c 7241 break;
ddf0322d 7242 case IORING_OP_PROVIDE_BUFFERS:
889fca73 7243 ret = io_provide_buffers(req, issue_flags);
ddf0322d 7244 break;
067524e9 7245 case IORING_OP_REMOVE_BUFFERS:
889fca73 7246 ret = io_remove_buffers(req, issue_flags);
3e4827b0 7247 break;
f2a8d5c7 7248 case IORING_OP_TEE:
45d189c6 7249 ret = io_tee(req, issue_flags);
f2a8d5c7 7250 break;
36f4fa68 7251 case IORING_OP_SHUTDOWN:
45d189c6 7252 ret = io_shutdown(req, issue_flags);
36f4fa68 7253 break;
80a261fd 7254 case IORING_OP_RENAMEAT:
45d189c6 7255 ret = io_renameat(req, issue_flags);
80a261fd 7256 break;
14a1143b 7257 case IORING_OP_UNLINKAT:
45d189c6 7258 ret = io_unlinkat(req, issue_flags);
14a1143b 7259 break;
e34a02dc
DK
7260 case IORING_OP_MKDIRAT:
7261 ret = io_mkdirat(req, issue_flags);
7262 break;
7a8721f8
DK
7263 case IORING_OP_SYMLINKAT:
7264 ret = io_symlinkat(req, issue_flags);
7265 break;
cf30da90
DK
7266 case IORING_OP_LINKAT:
7267 ret = io_linkat(req, issue_flags);
7268 break;
4f57f06c
JA
7269 case IORING_OP_MSG_RING:
7270 ret = io_msg_ring(req, issue_flags);
7271 break;
2b188cc1
JA
7272 default:
7273 ret = -EINVAL;
7274 break;
7275 }
7276
5bd2182d
PM
7277 if (!io_op_defs[req->opcode].audit_skip)
7278 audit_uring_exit(!ret, ret);
7279
5730b27e
JA
7280 if (creds)
7281 revert_creds(creds);
def596e9
JA
7282 if (ret)
7283 return ret;
b532576e 7284 /* If the op doesn't have a file, we're not polling for it */
9983028e 7285 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
9882131c 7286 io_iopoll_req_issued(req, issue_flags);
def596e9
JA
7287
7288 return 0;
2b188cc1
JA
7289}
7290
ebc11b6c
PB
7291static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
7292{
7293 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7294
7295 req = io_put_req_find_next(req);
7296 return req ? &req->work : NULL;
7297}
7298
5280f7e5 7299static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
7300{
7301 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6bf9c47a 7302 const struct io_op_def *def = &io_op_defs[req->opcode];
d01905db
PB
7303 unsigned int issue_flags = IO_URING_F_UNLOCKED;
7304 bool needs_poll = false;
6bf9c47a 7305 int ret = 0, err = -ECANCELED;
2b188cc1 7306
48dcd38d
PB
7307 /* one will be dropped by ->io_free_work() after returning to io-wq */
7308 if (!(req->flags & REQ_F_REFCOUNT))
7309 __io_req_set_refcount(req, 2);
7310 else
7311 req_ref_get(req);
5d5901a3 7312
cb2d344c 7313 io_arm_ltimeout(req);
6bf9c47a 7314
dadebc35 7315 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
d01905db 7316 if (work->flags & IO_WQ_WORK_CANCEL) {
0f8da75b 7317fail:
6bf9c47a 7318 io_req_task_queue_fail(req, err);
d01905db
PB
7319 return;
7320 }
0f8da75b
PB
7321 if (!io_assign_file(req, issue_flags)) {
7322 err = -EBADF;
7323 work->flags |= IO_WQ_WORK_CANCEL;
7324 goto fail;
7325 }
31b51510 7326
d01905db 7327 if (req->flags & REQ_F_FORCE_ASYNC) {
afb7f56f
PB
7328 bool opcode_poll = def->pollin || def->pollout;
7329
7330 if (opcode_poll && file_can_poll(req->file)) {
7331 needs_poll = true;
d01905db 7332 issue_flags |= IO_URING_F_NONBLOCK;
afb7f56f 7333 }
561fb04a 7334 }
31b51510 7335
d01905db
PB
7336 do {
7337 ret = io_issue_sqe(req, issue_flags);
7338 if (ret != -EAGAIN)
7339 break;
7340 /*
7341 * We can get EAGAIN for iopolled IO even though we're
7342 * forcing a sync submission from here, since we can't
7343 * wait for request slots on the block side.
7344 */
7345 if (!needs_poll) {
7346 cond_resched();
7347 continue;
90fa0288
HX
7348 }
7349
4d9237e3 7350 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
d01905db
PB
7351 return;
7352 /* aborted or ready, in either case retry blocking */
7353 needs_poll = false;
7354 issue_flags &= ~IO_URING_F_NONBLOCK;
7355 } while (1);
31b51510 7356
a3df7698 7357 /* avoid locking problems by failing it from a clean context */
5d5901a3 7358 if (ret)
a3df7698 7359 io_req_task_queue_fail(req, ret);
2b188cc1
JA
7360}
7361
aeca241b 7362static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
042b0d85 7363 unsigned i)
65e19f54 7364{
042b0d85 7365 return &table->files[i];
dafecf19
PB
7366}
7367
65e19f54
JA
7368static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
7369 int index)
7370{
aeca241b 7371 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
65e19f54 7372
a04b0ac0 7373 return (struct file *) (slot->file_ptr & FFS_MASK);
65e19f54
JA
7374}
7375
a04b0ac0 7376static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
9a321c98
PB
7377{
7378 unsigned long file_ptr = (unsigned long) file;
7379
88459b50 7380 file_ptr |= io_file_get_flags(file);
a04b0ac0 7381 file_slot->file_ptr = file_ptr;
65e19f54
JA
7382}
7383
5106dd6e
JA
7384static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
7385 unsigned int issue_flags)
09bb8394 7386{
5106dd6e
JA
7387 struct io_ring_ctx *ctx = req->ctx;
7388 struct file *file = NULL;
ac177053 7389 unsigned long file_ptr;
09bb8394 7390
5106dd6e
JA
7391 if (issue_flags & IO_URING_F_UNLOCKED)
7392 mutex_lock(&ctx->uring_lock);
7393
ac177053 7394 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
5106dd6e 7395 goto out;
ac177053
PB
7396 fd = array_index_nospec(fd, ctx->nr_user_files);
7397 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
7398 file = (struct file *) (file_ptr & FFS_MASK);
7399 file_ptr &= ~FFS_MASK;
7400 /* mask in overlapping REQ_F and FFS bits */
35645ac3 7401 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
5106dd6e
JA
7402 io_req_set_rsrc_node(req, ctx, 0);
7403out:
7404 if (issue_flags & IO_URING_F_UNLOCKED)
7405 mutex_unlock(&ctx->uring_lock);
ac177053
PB
7406 return file;
7407}
d44f554e 7408
d5361233
JA
7409/*
7410 * Drop the file for requeue operations. Only used of req->file is the
7411 * io_uring descriptor itself.
7412 */
7413static void io_drop_inflight_file(struct io_kiocb *req)
7414{
7415 if (unlikely(req->flags & REQ_F_INFLIGHT)) {
7416 fput(req->file);
7417 req->file = NULL;
7418 req->flags &= ~REQ_F_INFLIGHT;
7419 }
7420}
7421
5106dd6e 7422static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
ac177053 7423{
62906e89 7424 struct file *file = fget(fd);
ac177053 7425
cef216fc 7426 trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
09bb8394 7427
ac177053 7428 /* we don't allow fixed io_uring files */
d5361233
JA
7429 if (file && file->f_op == &io_uring_fops)
7430 req->flags |= REQ_F_INFLIGHT;
8371adf5 7431 return file;
09bb8394
JA
7432}
7433
f237c30a 7434static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
89b263f6
JA
7435{
7436 struct io_kiocb *prev = req->timeout.prev;
617a8948 7437 int ret = -ENOENT;
89b263f6
JA
7438
7439 if (prev) {
617a8948 7440 if (!(req->task->flags & PF_EXITING))
cef216fc 7441 ret = io_try_cancel_userdata(req, prev->cqe.user_data);
505657bc 7442 io_req_complete_post(req, ret ?: -ETIME, 0);
89b263f6 7443 io_put_req(prev);
89b263f6
JA
7444 } else {
7445 io_req_complete_post(req, -ETIME, 0);
7446 }
7447}
7448
2665abfd 7449static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 7450{
ad8a48ac
JA
7451 struct io_timeout_data *data = container_of(timer,
7452 struct io_timeout_data, timer);
90cd7e42 7453 struct io_kiocb *prev, *req = data->req;
2665abfd 7454 struct io_ring_ctx *ctx = req->ctx;
2665abfd 7455 unsigned long flags;
2665abfd 7456
89b263f6 7457 spin_lock_irqsave(&ctx->timeout_lock, flags);
90cd7e42
PB
7458 prev = req->timeout.head;
7459 req->timeout.head = NULL;
2665abfd
JA
7460
7461 /*
7462 * We don't expect the list to be empty, that will only happen if we
7463 * race with the completion of the linked work.
7464 */
447c19f3 7465 if (prev) {
f2f87370 7466 io_remove_next_linked(prev);
447c19f3
PB
7467 if (!req_ref_inc_not_zero(prev))
7468 prev = NULL;
7469 }
ef9dd637 7470 list_del(&req->timeout.list);
89b263f6
JA
7471 req->timeout.prev = prev;
7472 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
2665abfd 7473
89b263f6 7474 req->io_task_work.func = io_req_task_link_timeout;
4813c377 7475 io_req_task_work_add(req, false);
2665abfd
JA
7476 return HRTIMER_NORESTART;
7477}
7478
de968c18 7479static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 7480{
de968c18
PB
7481 struct io_ring_ctx *ctx = req->ctx;
7482
89b263f6 7483 spin_lock_irq(&ctx->timeout_lock);
76a46e06 7484 /*
f2f87370
PB
7485 * If the back reference is NULL, then our linked request finished
7486 * before we got a chance to setup the timer
76a46e06 7487 */
90cd7e42 7488 if (req->timeout.head) {
e8c2bc1f 7489 struct io_timeout_data *data = req->async_data;
94ae5e77 7490
ad8a48ac
JA
7491 data->timer.function = io_link_timeout_fn;
7492 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
7493 data->mode);
ef9dd637 7494 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
2665abfd 7495 }
89b263f6 7496 spin_unlock_irq(&ctx->timeout_lock);
2665abfd 7497 /* drop submission reference */
76a46e06
JA
7498 io_put_req(req);
7499}
2665abfd 7500
7bfa9bad 7501static void io_queue_async(struct io_kiocb *req, int ret)
d475a9a6
PB
7502 __must_hold(&req->ctx->uring_lock)
7503{
7bfa9bad
PB
7504 struct io_kiocb *linked_timeout;
7505
7506 if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
7507 io_req_complete_failed(req, ret);
7508 return;
7509 }
7510
7511 linked_timeout = io_prep_linked_timeout(req);
d475a9a6 7512
4d9237e3 7513 switch (io_arm_poll_handler(req, 0)) {
d475a9a6 7514 case IO_APOLL_READY:
d475a9a6
PB
7515 io_req_task_queue(req);
7516 break;
7517 case IO_APOLL_ABORTED:
7518 /*
7519 * Queued up for async execution, worker will release
7520 * submit reference when the iocb is actually submitted.
7521 */
77955efb 7522 io_queue_iowq(req, NULL);
d475a9a6 7523 break;
b1c62645 7524 case IO_APOLL_OK:
b1c62645 7525 break;
d475a9a6
PB
7526 }
7527
7528 if (linked_timeout)
7529 io_queue_linked_timeout(linked_timeout);
7530}
7531
cbc2e203 7532static inline void io_queue_sqe(struct io_kiocb *req)
282cdc86 7533 __must_hold(&req->ctx->uring_lock)
2b188cc1 7534{
e0c5c576 7535 int ret;
2b188cc1 7536
c5eef2b9 7537 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 7538
fff4e40e
PB
7539 if (req->flags & REQ_F_COMPLETE_INLINE) {
7540 io_req_add_compl_list(req);
d9f9d284 7541 return;
fff4e40e 7542 }
491381ce
JA
7543 /*
7544 * We async punt it if the file wasn't marked NOWAIT, or if the file
7545 * doesn't support non-blocking read/write attempts
7546 */
7bfa9bad 7547 if (likely(!ret))
cb2d344c 7548 io_arm_ltimeout(req);
7bfa9bad
PB
7549 else
7550 io_queue_async(req, ret);
2b188cc1
JA
7551}
7552
4652fe3f 7553static void io_queue_sqe_fallback(struct io_kiocb *req)
282cdc86 7554 __must_hold(&req->ctx->uring_lock)
4fe2c963 7555{
17b147f6
PB
7556 if (unlikely(req->flags & REQ_F_FAIL)) {
7557 /*
7558 * We don't submit, fail them all, for that replace hardlinks
7559 * with normal links. Extra REQ_F_LINK is tolerated.
7560 */
7561 req->flags &= ~REQ_F_HARDLINK;
7562 req->flags |= REQ_F_LINK;
7563 io_req_complete_failed(req, req->cqe.res);
e0eb71dc
PB
7564 } else if (unlikely(req->ctx->drain_active)) {
7565 io_drain_req(req);
76cc33d7
PB
7566 } else {
7567 int ret = io_req_prep_async(req);
7568
7569 if (unlikely(ret))
7570 io_req_complete_failed(req, ret);
7571 else
77955efb 7572 io_queue_iowq(req, NULL);
ce35a47a 7573 }
4fe2c963
JL
7574}
7575
b16fed66
PB
7576/*
7577 * Check SQE restrictions (opcode and flags).
7578 *
7579 * Returns 'true' if SQE is allowed, 'false' otherwise.
7580 */
7581static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7582 struct io_kiocb *req,
7583 unsigned int sqe_flags)
4fe2c963 7584{
b16fed66
PB
7585 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7586 return false;
7587
7588 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7589 ctx->restrictions.sqe_flags_required)
7590 return false;
7591
7592 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7593 ctx->restrictions.sqe_flags_required))
7594 return false;
7595
7596 return true;
4fe2c963
JL
7597}
7598
22b2ca31
PB
7599static void io_init_req_drain(struct io_kiocb *req)
7600{
7601 struct io_ring_ctx *ctx = req->ctx;
7602 struct io_kiocb *head = ctx->submit_state.link.head;
7603
7604 ctx->drain_active = true;
7605 if (head) {
7606 /*
7607 * If we need to drain a request in the middle of a link, drain
7608 * the head request and the next request/link after the current
7609 * link. Considering sequential execution of links,
b6c7db32 7610 * REQ_F_IO_DRAIN will be maintained for every request of our
22b2ca31
PB
7611 * link.
7612 */
b6c7db32 7613 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
22b2ca31
PB
7614 ctx->drain_next = true;
7615 }
7616}
7617
b16fed66
PB
7618static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7619 const struct io_uring_sqe *sqe)
282cdc86 7620 __must_hold(&ctx->uring_lock)
b16fed66 7621{
b16fed66 7622 unsigned int sqe_flags;
fc0ae024 7623 int personality;
4a04d1d1 7624 u8 opcode;
b16fed66 7625
864ea921 7626 /* req is partially pre-initialised, see io_preinit_req() */
4a04d1d1 7627 req->opcode = opcode = READ_ONCE(sqe->opcode);
b16fed66
PB
7628 /* same numerical values with corresponding REQ_F_*, safe to copy */
7629 req->flags = sqe_flags = READ_ONCE(sqe->flags);
cef216fc 7630 req->cqe.user_data = READ_ONCE(sqe->user_data);
b16fed66 7631 req->file = NULL;
b16fed66 7632 req->fixed_rsrc_refs = NULL;
b16fed66 7633 req->task = current;
b16fed66 7634
4a04d1d1
PB
7635 if (unlikely(opcode >= IORING_OP_LAST)) {
7636 req->opcode = 0;
b16fed66 7637 return -EINVAL;
4a04d1d1 7638 }
68fe256a
PB
7639 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7640 /* enforce forwards compatibility on users */
7641 if (sqe_flags & ~SQE_VALID_FLAGS)
7642 return -EINVAL;
7643 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
4a04d1d1 7644 !io_op_defs[opcode].buffer_select)
68fe256a 7645 return -EOPNOTSUPP;
5562a8d7
PB
7646 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
7647 ctx->drain_disabled = true;
7648 if (sqe_flags & IOSQE_IO_DRAIN) {
7649 if (ctx->drain_disabled)
7650 return -EOPNOTSUPP;
22b2ca31 7651 io_init_req_drain(req);
5562a8d7 7652 }
2a56a9bd
PB
7653 }
7654 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7655 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7656 return -EACCES;
7657 /* knock it to the slow queue path, will be drained there */
7658 if (ctx->drain_active)
7659 req->flags |= REQ_F_FORCE_ASYNC;
7660 /* if there is no link, we're at "next" request and need to drain */
7661 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7662 ctx->drain_next = false;
7663 ctx->drain_active = true;
b6c7db32 7664 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2a56a9bd 7665 }
68fe256a 7666 }
b16fed66 7667
4a04d1d1 7668 if (io_op_defs[opcode].needs_file) {
6d63416d
PB
7669 struct io_submit_state *state = &ctx->submit_state;
7670
cef216fc 7671 req->cqe.fd = READ_ONCE(sqe->fd);
6bf9c47a 7672
6d63416d
PB
7673 /*
7674 * Plug now if we have more than 2 IO left after this, and the
7675 * target is potentially a read/write to block based storage.
7676 */
4a04d1d1 7677 if (state->need_plug && io_op_defs[opcode].plug) {
6d63416d
PB
7678 state->plug_started = true;
7679 state->need_plug = false;
5ca7a8b3 7680 blk_start_plug_nr_ios(&state->plug, state->submit_nr);
6d63416d 7681 }
b16fed66 7682 }
863e0560 7683
003e8dcc
JA
7684 personality = READ_ONCE(sqe->personality);
7685 if (personality) {
cdab10bf
LT
7686 int ret;
7687
c10d1f98
PB
7688 req->creds = xa_load(&ctx->personalities, personality);
7689 if (!req->creds)
003e8dcc 7690 return -EINVAL;
c10d1f98 7691 get_cred(req->creds);
cdc1404a
PM
7692 ret = security_uring_override_creds(req->creds);
7693 if (ret) {
7694 put_cred(req->creds);
7695 return ret;
7696 }
b8e64b53 7697 req->flags |= REQ_F_CREDS;
003e8dcc 7698 }
b16fed66 7699
fc0ae024 7700 return io_req_prep(req, sqe);
b16fed66
PB
7701}
7702
df3becde
PB
7703static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
7704 struct io_kiocb *req, int ret)
7705{
7706 struct io_ring_ctx *ctx = req->ctx;
7707 struct io_submit_link *link = &ctx->submit_state.link;
7708 struct io_kiocb *head = link->head;
7709
7710 trace_io_uring_req_failed(sqe, ctx, req, ret);
7711
7712 /*
7713 * Avoid breaking links in the middle as it renders links with SQPOLL
7714 * unusable. Instead of failing eagerly, continue assembling the link if
7715 * applicable and mark the head with REQ_F_FAIL. The link flushing code
7716 * should find the flag and handle the rest.
7717 */
7718 req_fail_link_node(req, ret);
7719 if (head && !(head->flags & REQ_F_FAIL))
7720 req_fail_link_node(head, -ECANCELED);
7721
7722 if (!(req->flags & IO_REQ_LINK_FLAGS)) {
7723 if (head) {
7724 link->last->link = req;
7725 link->head = NULL;
7726 req = head;
7727 }
7728 io_queue_sqe_fallback(req);
7729 return ret;
7730 }
7731
7732 if (head)
7733 link->last->link = req;
7734 else
7735 link->head = req;
7736 link->last = req;
7737 return 0;
7738}
7739
7740static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 7741 const struct io_uring_sqe *sqe)
282cdc86 7742 __must_hold(&ctx->uring_lock)
9e645e11 7743{
a1ab7b35 7744 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 7745 int ret;
9e645e11 7746
a6b8cadc 7747 ret = io_init_req(ctx, req, sqe);
df3becde
PB
7748 if (unlikely(ret))
7749 return io_submit_fail_init(sqe, req, ret);
441b8a78 7750
be7053b7 7751 /* don't need @sqe from now on */
cef216fc 7752 trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
236daeae
OL
7753 req->flags, true,
7754 ctx->flags & IORING_SETUP_SQPOLL);
a6b8cadc 7755
9e645e11
JA
7756 /*
7757 * If we already have a head request, queue this one for async
7758 * submittal once the head completes. If we don't have a head but
7759 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7760 * submitted sync once the chain is complete. If none of those
7761 * conditions are true (normal request), then just queue it.
7762 */
924a07e4 7763 if (unlikely(link->head)) {
df3becde
PB
7764 ret = io_req_prep_async(req);
7765 if (unlikely(ret))
7766 return io_submit_fail_init(sqe, req, ret);
7767
7768 trace_io_uring_link(ctx, req, link->head);
f2f87370 7769 link->last->link = req;
863e0560 7770 link->last = req;
32fe525b 7771
da1a08c5 7772 if (req->flags & IO_REQ_LINK_FLAGS)
f15a3431 7773 return 0;
df3becde
PB
7774 /* last request of the link, flush it */
7775 req = link->head;
f15a3431 7776 link->head = NULL;
924a07e4
PB
7777 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
7778 goto fallback;
7779
7780 } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
7781 REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
7782 if (req->flags & IO_REQ_LINK_FLAGS) {
7783 link->head = req;
7784 link->last = req;
7785 } else {
7786fallback:
7787 io_queue_sqe_fallback(req);
7788 }
f15a3431 7789 return 0;
9e645e11 7790 }
2e6e1fde 7791
924a07e4 7792 io_queue_sqe(req);
1d4240cc 7793 return 0;
9e645e11
JA
7794}
7795
9a56a232
JA
7796/*
7797 * Batched submission is done, ensure local IO is flushed out.
7798 */
553deffd 7799static void io_submit_state_end(struct io_ring_ctx *ctx)
9a56a232 7800{
553deffd
PB
7801 struct io_submit_state *state = &ctx->submit_state;
7802
e126391c
PB
7803 if (unlikely(state->link.head))
7804 io_queue_sqe_fallback(state->link.head);
553deffd 7805 /* flush only after queuing links as they can generate completions */
c450178d 7806 io_submit_flush_completions(ctx);
27926b68
JA
7807 if (state->plug_started)
7808 blk_finish_plug(&state->plug);
9a56a232
JA
7809}
7810
7811/*
7812 * Start submission side cache.
7813 */
7814static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 7815 unsigned int max_ios)
9a56a232 7816{
27926b68 7817 state->plug_started = false;
4b628aeb 7818 state->need_plug = max_ios > 2;
5ca7a8b3 7819 state->submit_nr = max_ios;
a1ab7b35
PB
7820 /* set only head, no need to init link_last in advance */
7821 state->link.head = NULL;
9a56a232
JA
7822}
7823
2b188cc1
JA
7824static void io_commit_sqring(struct io_ring_ctx *ctx)
7825{
75b28aff 7826 struct io_rings *rings = ctx->rings;
2b188cc1 7827
caf582c6
PB
7828 /*
7829 * Ensure any loads from the SQEs are done at this point,
7830 * since once we write the new head, the application could
7831 * write new data to them.
7832 */
7833 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
7834}
7835
2b188cc1 7836/*
dd9ae8a0 7837 * Fetch an sqe, if one is available. Note this returns a pointer to memory
2b188cc1
JA
7838 * that is mapped by userspace. This means that care needs to be taken to
7839 * ensure that reads are stable, as we cannot rely on userspace always
7840 * being a good citizen. If members of the sqe are validated and then later
7841 * used, it's important that those reads are done through READ_ONCE() to
7842 * prevent a re-load down the line.
7843 */
709b302f 7844static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 7845{
ea5ab3b5 7846 unsigned head, mask = ctx->sq_entries - 1;
17d3aeb3 7847 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2b188cc1
JA
7848
7849 /*
7850 * The cached sq head (or cq tail) serves two purposes:
7851 *
7852 * 1) allows us to batch the cost of updating the user visible
7853 * head updates.
7854 * 2) allows the kernel side to track the head on its own, even
7855 * though the application is the one updating it.
7856 */
17d3aeb3 7857 head = READ_ONCE(ctx->sq_array[sq_idx]);
709b302f
PB
7858 if (likely(head < ctx->sq_entries))
7859 return &ctx->sq_sqes[head];
2b188cc1
JA
7860
7861 /* drop invalid entries */
15641e42
PB
7862 ctx->cq_extra--;
7863 WRITE_ONCE(ctx->rings->sq_dropped,
7864 READ_ONCE(ctx->rings->sq_dropped) + 1);
709b302f
PB
7865 return NULL;
7866}
7867
0f212204 7868static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
282cdc86 7869 __must_hold(&ctx->uring_lock)
6c271ce2 7870{
69629809 7871 unsigned int entries = io_sqring_entries(ctx);
8e6971a8
PB
7872 unsigned int left;
7873 int ret;
6c271ce2 7874
51d48dab 7875 if (unlikely(!entries))
69629809 7876 return 0;
ee7d46d9 7877 /* make sure SQ entry isn't read before tail */
8e6971a8
PB
7878 ret = left = min3(nr, ctx->sq_entries, entries);
7879 io_get_task_refs(left);
7880 io_submit_state_start(&ctx->submit_state, left);
6c271ce2 7881
69629809 7882 do {
3529d8c2 7883 const struct io_uring_sqe *sqe;
196be95c 7884 struct io_kiocb *req;
fb5ccc98 7885
8e6971a8 7886 if (unlikely(!io_alloc_req_refill(ctx)))
fb5ccc98 7887 break;
a33ae9ce 7888 req = io_alloc_req(ctx);
4fccfcbb
PB
7889 sqe = io_get_sqe(ctx);
7890 if (unlikely(!sqe)) {
fa05457a 7891 io_req_add_to_cache(req, ctx);
4fccfcbb
PB
7892 break;
7893 }
1cd15904
PB
7894
7895 /*
7896 * Continue submitting even for sqe failure if the
7897 * ring was setup with IORING_SETUP_SUBMIT_ALL
7898 */
7899 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
7900 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
7901 left--;
7902 break;
bcbb7bf6 7903 }
1cd15904 7904 } while (--left);
9466f437 7905
8e6971a8
PB
7906 if (unlikely(left)) {
7907 ret -= left;
7908 /* try again if it submitted nothing and can't allocate a req */
7909 if (!ret && io_req_cache_empty(ctx))
7910 ret = -EAGAIN;
7911 current->io_uring->cached_refs += left;
9466f437 7912 }
6c271ce2 7913
553deffd 7914 io_submit_state_end(ctx);
ae9428ca
PB
7915 /* Commit SQ ring head once we've consumed and submitted all SQEs */
7916 io_commit_sqring(ctx);
8e6971a8 7917 return ret;
6c271ce2
JA
7918}
7919
e4b6d902
PB
7920static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7921{
7922 return READ_ONCE(sqd->state);
7923}
7924
23b3628e
XW
7925static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7926{
7927 /* Tell userspace we may need a wakeup call */
79ebeaee 7928 spin_lock(&ctx->completion_lock);
20c0b380
NA
7929 WRITE_ONCE(ctx->rings->sq_flags,
7930 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
79ebeaee 7931 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7932}
7933
7934static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7935{
79ebeaee 7936 spin_lock(&ctx->completion_lock);
20c0b380
NA
7937 WRITE_ONCE(ctx->rings->sq_flags,
7938 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
79ebeaee 7939 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7940}
7941
08369246 7942static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 7943{
c8d1ba58 7944 unsigned int to_submit;
bdcd3eab 7945 int ret = 0;
6c271ce2 7946
c8d1ba58 7947 to_submit = io_sqring_entries(ctx);
e95eee2d 7948 /* if we're handling multiple rings, cap submit size for fairness */
4ce8ad95
OL
7949 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7950 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
e95eee2d 7951
5eef4e87 7952 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
948e1947
PB
7953 const struct cred *creds = NULL;
7954
7955 if (ctx->sq_creds != current_cred())
7956 creds = override_creds(ctx->sq_creds);
a4c0b3de 7957
c8d1ba58 7958 mutex_lock(&ctx->uring_lock);
5eef4e87 7959 if (!wq_list_empty(&ctx->iopoll_list))
5ba3c874 7960 io_do_iopoll(ctx, true);
906a3c6f 7961
3b763ba1
PB
7962 /*
7963 * Don't submit if refs are dying, good for io_uring_register(),
7964 * but also it is relied upon by io_ring_exit_work()
7965 */
0298ef96
PB
7966 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7967 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 7968 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58 7969 mutex_unlock(&ctx->uring_lock);
cb318216 7970
acfb381d
PB
7971 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7972 wake_up(&ctx->sqo_sq_wait);
948e1947
PB
7973 if (creds)
7974 revert_creds(creds);
acfb381d 7975 }
6c271ce2 7976
08369246
XW
7977 return ret;
7978}
6c271ce2 7979
c072481d 7980static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
08369246
XW
7981{
7982 struct io_ring_ctx *ctx;
7983 unsigned sq_thread_idle = 0;
6c271ce2 7984
c9dca27d
PB
7985 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7986 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
08369246 7987 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 7988}
6c271ce2 7989
e4b6d902
PB
7990static bool io_sqd_handle_event(struct io_sq_data *sqd)
7991{
7992 bool did_sig = false;
7993 struct ksignal ksig;
7994
7995 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7996 signal_pending(current)) {
7997 mutex_unlock(&sqd->lock);
7998 if (signal_pending(current))
7999 did_sig = get_signal(&ksig);
8000 cond_resched();
8001 mutex_lock(&sqd->lock);
8002 }
e4b6d902
PB
8003 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8004}
8005
c8d1ba58
JA
8006static int io_sq_thread(void *data)
8007{
69fb2131
JA
8008 struct io_sq_data *sqd = data;
8009 struct io_ring_ctx *ctx;
a0d9205f 8010 unsigned long timeout = 0;
37d1e2e3 8011 char buf[TASK_COMM_LEN];
08369246 8012 DEFINE_WAIT(wait);
6c271ce2 8013
696ee88a 8014 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
37d1e2e3 8015 set_task_comm(current, buf);
37d1e2e3
JA
8016
8017 if (sqd->sq_cpu != -1)
8018 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
8019 else
8020 set_cpus_allowed_ptr(current, cpu_online_mask);
8021 current->flags |= PF_NO_SETAFFINITY;
8022
5bd2182d
PM
8023 audit_alloc_kernel(current);
8024
09a6f4ef 8025 mutex_lock(&sqd->lock);
e4b6d902 8026 while (1) {
1a924a80 8027 bool cap_entries, sqt_spin = false;
c1edbf5f 8028
e4b6d902
PB
8029 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
8030 if (io_sqd_handle_event(sqd))
c7d95613 8031 break;
08369246
XW
8032 timeout = jiffies + sqd->sq_thread_idle;
8033 }
e4b6d902 8034
e95eee2d 8035 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 8036 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
948e1947 8037 int ret = __io_sq_thread(ctx, cap_entries);
7c30f36a 8038
5eef4e87 8039 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
08369246 8040 sqt_spin = true;
69fb2131 8041 }
dd432ea5
PB
8042 if (io_run_task_work())
8043 sqt_spin = true;
6c271ce2 8044
08369246 8045 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 8046 cond_resched();
08369246
XW
8047 if (sqt_spin)
8048 timeout = jiffies + sqd->sq_thread_idle;
8049 continue;
8050 }
8051
08369246 8052 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7f62d40d 8053 if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
1a924a80
PB
8054 bool needs_sched = true;
8055
724cb4f9 8056 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
aaa9f0f4
PB
8057 io_ring_set_wakeup_flag(ctx);
8058
724cb4f9 8059 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5eef4e87 8060 !wq_list_empty(&ctx->iopoll_list)) {
724cb4f9
HX
8061 needs_sched = false;
8062 break;
8063 }
649bb75d
AK
8064
8065 /*
8066 * Ensure the store of the wakeup flag is not
8067 * reordered with the load of the SQ tail
8068 */
8069 smp_mb();
8070
724cb4f9
HX
8071 if (io_sqring_entries(ctx)) {
8072 needs_sched = false;
8073 break;
8074 }
8075 }
8076
8077 if (needs_sched) {
8078 mutex_unlock(&sqd->lock);
8079 schedule();
8080 mutex_lock(&sqd->lock);
8081 }
69fb2131
JA
8082 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8083 io_ring_clear_wakeup_flag(ctx);
6c271ce2 8084 }
08369246
XW
8085
8086 finish_wait(&sqd->wait, &wait);
8087 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 8088 }
28cea78a 8089
78cc687b 8090 io_uring_cancel_generic(true, sqd);
37d1e2e3 8091 sqd->thread = NULL;
05962f95 8092 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
5f3f26f9 8093 io_ring_set_wakeup_flag(ctx);
521d6a73 8094 io_run_task_work();
734551df
PB
8095 mutex_unlock(&sqd->lock);
8096
5bd2182d
PM
8097 audit_free(current);
8098
37d1e2e3
JA
8099 complete(&sqd->exited);
8100 do_exit(0);
6c271ce2
JA
8101}
8102
bda52162
JA
8103struct io_wait_queue {
8104 struct wait_queue_entry wq;
8105 struct io_ring_ctx *ctx;
5fd46178 8106 unsigned cq_tail;
bda52162
JA
8107 unsigned nr_timeouts;
8108};
8109
6c503150 8110static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
8111{
8112 struct io_ring_ctx *ctx = iowq->ctx;
5fd46178 8113 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
bda52162
JA
8114
8115 /*
d195a66e 8116 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
8117 * started waiting. For timeouts, we always want to return to userspace,
8118 * regardless of event count.
8119 */
5fd46178 8120 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
bda52162
JA
8121}
8122
8123static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
8124 int wake_flags, void *key)
8125{
8126 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
8127 wq);
8128
6c503150
PB
8129 /*
8130 * Cannot safely flush overflowed CQEs from here, ensure we wake up
8131 * the task, and the next invocation will do it.
8132 */
5ed7a37d 8133 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
6c503150
PB
8134 return autoremove_wake_function(curr, mode, wake_flags, key);
8135 return -1;
bda52162
JA
8136}
8137
af9c1a44
JA
8138static int io_run_task_work_sig(void)
8139{
8140 if (io_run_task_work())
8141 return 1;
0b8cfa97 8142 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
792ee0f6 8143 return -ERESTARTSYS;
c5020bc8
OL
8144 if (task_sigpending(current))
8145 return -EINTR;
8146 return 0;
af9c1a44
JA
8147}
8148
eeb60b9a
PB
8149/* when returns >0, the caller should retry */
8150static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
8151 struct io_wait_queue *iowq,
22833966 8152 ktime_t timeout)
eeb60b9a
PB
8153{
8154 int ret;
8155
8156 /* make sure we run task_work before checking for signals */
8157 ret = io_run_task_work_sig();
8158 if (ret || io_should_wake(iowq))
8159 return ret;
8160 /* let the caller flush overflows, retry */
5ed7a37d 8161 if (test_bit(0, &ctx->check_cq_overflow))
eeb60b9a
PB
8162 return 1;
8163
22833966
JA
8164 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
8165 return -ETIME;
8166 return 1;
eeb60b9a
PB
8167}
8168
2b188cc1
JA
8169/*
8170 * Wait until events become available, if we don't already have some. The
8171 * application must reap them itself, as they reside on the shared cq ring.
8172 */
8173static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
8174 const sigset_t __user *sig, size_t sigsz,
8175 struct __kernel_timespec __user *uts)
2b188cc1 8176{
90291099 8177 struct io_wait_queue iowq;
75b28aff 8178 struct io_rings *rings = ctx->rings;
22833966 8179 ktime_t timeout = KTIME_MAX;
c1d5a224 8180 int ret;
2b188cc1 8181
b41e9852 8182 do {
90f67366 8183 io_cqring_overflow_flush(ctx);
6c503150 8184 if (io_cqring_events(ctx) >= min_events)
b41e9852 8185 return 0;
4c6e277c 8186 if (!io_run_task_work())
b41e9852 8187 break;
b41e9852 8188 } while (1);
2b188cc1
JA
8189
8190 if (sig) {
9e75ad5d
AB
8191#ifdef CONFIG_COMPAT
8192 if (in_compat_syscall())
8193 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 8194 sigsz);
9e75ad5d
AB
8195 else
8196#endif
b772434b 8197 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 8198
2b188cc1
JA
8199 if (ret)
8200 return ret;
8201 }
8202
950e79dd
OL
8203 if (uts) {
8204 struct timespec64 ts;
8205
8206 if (get_timespec64(&ts, uts))
8207 return -EFAULT;
8208 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
8209 }
8210
90291099
PB
8211 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
8212 iowq.wq.private = current;
8213 INIT_LIST_HEAD(&iowq.wq.entry);
8214 iowq.ctx = ctx;
bda52162 8215 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
5fd46178 8216 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
90291099 8217
c826bd7a 8218 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 8219 do {
ca0a2651 8220 /* if we can't even flush overflow, don't wait for more */
90f67366 8221 if (!io_cqring_overflow_flush(ctx)) {
ca0a2651
JA
8222 ret = -EBUSY;
8223 break;
8224 }
311997b3 8225 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
bda52162 8226 TASK_INTERRUPTIBLE);
22833966 8227 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
ca0a2651 8228 cond_resched();
eeb60b9a 8229 } while (ret > 0);
bda52162 8230
b4f20bb4 8231 finish_wait(&ctx->cq_wait, &iowq.wq);
b7db41c9 8232 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 8233
75b28aff 8234 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
8235}
8236
9123c8ff 8237static void io_free_page_table(void **table, size_t size)
05f3fb3c 8238{
9123c8ff 8239 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
05f3fb3c 8240
846a4ef2 8241 for (i = 0; i < nr_tables; i++)
9123c8ff
PB
8242 kfree(table[i]);
8243 kfree(table);
8244}
8245
c072481d 8246static __cold void **io_alloc_page_table(size_t size)
9123c8ff
PB
8247{
8248 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
8249 size_t init_size = size;
8250 void **table;
8251
0bea96f5 8252 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
9123c8ff
PB
8253 if (!table)
8254 return NULL;
8255
8256 for (i = 0; i < nr_tables; i++) {
27f6b318 8257 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
9123c8ff 8258
0bea96f5 8259 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
9123c8ff
PB
8260 if (!table[i]) {
8261 io_free_page_table(table, init_size);
8262 return NULL;
8263 }
8264 size -= this_size;
8265 }
8266 return table;
05f3fb3c
JA
8267}
8268
28a9fe25 8269static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
1642b445 8270{
28a9fe25
PB
8271 percpu_ref_exit(&ref_node->refs);
8272 kfree(ref_node);
1642b445
PB
8273}
8274
c072481d 8275static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
b9bd2bea
PB
8276{
8277 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
8278 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
8279 unsigned long flags;
8280 bool first_add = false;
b36a2050 8281 unsigned long delay = HZ;
b9bd2bea
PB
8282
8283 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
8284 node->done = true;
8285
b36a2050
DY
8286 /* if we are mid-quiesce then do not delay */
8287 if (node->rsrc_data->quiesce)
8288 delay = 0;
8289
b9bd2bea
PB
8290 while (!list_empty(&ctx->rsrc_ref_list)) {
8291 node = list_first_entry(&ctx->rsrc_ref_list,
8292 struct io_rsrc_node, node);
8293 /* recycle ref nodes in order */
8294 if (!node->done)
8295 break;
8296 list_del(&node->node);
8297 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
8298 }
8299 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
8300
8301 if (first_add)
b36a2050 8302 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
b9bd2bea
PB
8303}
8304
f6133fbd 8305static struct io_rsrc_node *io_rsrc_node_alloc(void)
b9bd2bea
PB
8306{
8307 struct io_rsrc_node *ref_node;
8308
8309 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
8310 if (!ref_node)
8311 return NULL;
8312
8313 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
8314 0, GFP_KERNEL)) {
8315 kfree(ref_node);
8316 return NULL;
8317 }
8318 INIT_LIST_HEAD(&ref_node->node);
8319 INIT_LIST_HEAD(&ref_node->rsrc_list);
8320 ref_node->done = false;
8321 return ref_node;
8322}
8323
a7f0ed5a
PB
8324static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
8325 struct io_rsrc_data *data_to_kill)
ab409402 8326 __must_hold(&ctx->uring_lock)
6b06314c 8327{
a7f0ed5a
PB
8328 WARN_ON_ONCE(!ctx->rsrc_backup_node);
8329 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
6b06314c 8330
ab409402
PB
8331 io_rsrc_refs_drop(ctx);
8332
a7f0ed5a
PB
8333 if (data_to_kill) {
8334 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
82fbcfa9 8335
a7f0ed5a 8336 rsrc_node->rsrc_data = data_to_kill;
4956b9ea 8337 spin_lock_irq(&ctx->rsrc_ref_lock);
a7f0ed5a 8338 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
4956b9ea 8339 spin_unlock_irq(&ctx->rsrc_ref_lock);
82fbcfa9 8340
3e942498 8341 atomic_inc(&data_to_kill->refs);
a7f0ed5a
PB
8342 percpu_ref_kill(&rsrc_node->refs);
8343 ctx->rsrc_node = NULL;
8344 }
6b06314c 8345
a7f0ed5a
PB
8346 if (!ctx->rsrc_node) {
8347 ctx->rsrc_node = ctx->rsrc_backup_node;
8348 ctx->rsrc_backup_node = NULL;
8349 }
8bad28d8
HX
8350}
8351
a7f0ed5a 8352static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8dd03afe
PB
8353{
8354 if (ctx->rsrc_backup_node)
8355 return 0;
f6133fbd 8356 ctx->rsrc_backup_node = io_rsrc_node_alloc();
8dd03afe 8357 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8bad28d8
HX
8358}
8359
c072481d
PB
8360static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
8361 struct io_ring_ctx *ctx)
8bad28d8
HX
8362{
8363 int ret;
05589553 8364
215c3902 8365 /* As we may drop ->uring_lock, other task may have started quiesce */
8bad28d8
HX
8366 if (data->quiesce)
8367 return -ENXIO;
05589553 8368
8bad28d8 8369 data->quiesce = true;
1ffc5422 8370 do {
a7f0ed5a 8371 ret = io_rsrc_node_switch_start(ctx);
8dd03afe 8372 if (ret)
f2303b1f 8373 break;
a7f0ed5a 8374 io_rsrc_node_switch(ctx, data);
f2303b1f 8375
3e942498
PB
8376 /* kill initial ref, already quiesced if zero */
8377 if (atomic_dec_and_test(&data->refs))
8378 break;
c018db4a 8379 mutex_unlock(&ctx->uring_lock);
8bad28d8 8380 flush_delayed_work(&ctx->rsrc_put_work);
1ffc5422 8381 ret = wait_for_completion_interruptible(&data->done);
c018db4a
JA
8382 if (!ret) {
8383 mutex_lock(&ctx->uring_lock);
80912cef
DY
8384 if (atomic_read(&data->refs) > 0) {
8385 /*
8386 * it has been revived by another thread while
8387 * we were unlocked
8388 */
8389 mutex_unlock(&ctx->uring_lock);
8390 } else {
8391 break;
8392 }
c018db4a 8393 }
8bad28d8 8394
3e942498
PB
8395 atomic_inc(&data->refs);
8396 /* wait for all works potentially completing data->done */
8397 flush_delayed_work(&ctx->rsrc_put_work);
cb5e1b81 8398 reinit_completion(&data->done);
8dd03afe 8399
1ffc5422 8400 ret = io_run_task_work_sig();
8bad28d8 8401 mutex_lock(&ctx->uring_lock);
f2303b1f 8402 } while (ret >= 0);
8bad28d8 8403 data->quiesce = false;
05f3fb3c 8404
8bad28d8 8405 return ret;
d7954b2b
BM
8406}
8407
2d091d62
PB
8408static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
8409{
8410 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
8411 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
8412
8413 return &data->tags[table_idx][off];
8414}
8415
44b31f2f 8416static void io_rsrc_data_free(struct io_rsrc_data *data)
1ad555c6 8417{
2d091d62
PB
8418 size_t size = data->nr * sizeof(data->tags[0][0]);
8419
8420 if (data->tags)
8421 io_free_page_table((void **)data->tags, size);
44b31f2f
PB
8422 kfree(data);
8423}
8424
c072481d
PB
8425static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
8426 u64 __user *utags, unsigned nr,
8427 struct io_rsrc_data **pdata)
1ad555c6 8428{
b895c9a6 8429 struct io_rsrc_data *data;
2d091d62 8430 int ret = -ENOMEM;
d878c816 8431 unsigned i;
1ad555c6
BM
8432
8433 data = kzalloc(sizeof(*data), GFP_KERNEL);
8434 if (!data)
d878c816 8435 return -ENOMEM;
2d091d62 8436 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
b60c8dce 8437 if (!data->tags) {
1ad555c6 8438 kfree(data);
d878c816
PB
8439 return -ENOMEM;
8440 }
2d091d62
PB
8441
8442 data->nr = nr;
8443 data->ctx = ctx;
8444 data->do_put = do_put;
d878c816 8445 if (utags) {
2d091d62 8446 ret = -EFAULT;
d878c816 8447 for (i = 0; i < nr; i++) {
fdd1dc31
CIK
8448 u64 *tag_slot = io_get_tag_slot(data, i);
8449
8450 if (copy_from_user(tag_slot, &utags[i],
8451 sizeof(*tag_slot)))
2d091d62 8452 goto fail;
d878c816 8453 }
1ad555c6 8454 }
b60c8dce 8455
3e942498 8456 atomic_set(&data->refs, 1);
1ad555c6 8457 init_completion(&data->done);
d878c816
PB
8458 *pdata = data;
8459 return 0;
2d091d62
PB
8460fail:
8461 io_rsrc_data_free(data);
8462 return ret;
1ad555c6
BM
8463}
8464
9123c8ff
PB
8465static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
8466{
0bea96f5
PB
8467 table->files = kvcalloc(nr_files, sizeof(table->files[0]),
8468 GFP_KERNEL_ACCOUNT);
9123c8ff
PB
8469 return !!table->files;
8470}
8471
042b0d85 8472static void io_free_file_tables(struct io_file_table *table)
9123c8ff 8473{
042b0d85 8474 kvfree(table->files);
9123c8ff
PB
8475 table->files = NULL;
8476}
8477
fff4db76 8478static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1ad555c6 8479{
1f59bc0f
PB
8480 int i;
8481
8482 for (i = 0; i < ctx->nr_user_files; i++) {
8483 struct file *file = io_file_from_index(ctx, i);
8484
8485 if (!file || io_file_need_scm(file))
8486 continue;
8487 io_fixed_file_slot(&ctx->file_table, i)->file_ptr = 0;
8488 fput(file);
8489 }
8490
fff4db76
PB
8491#if defined(CONFIG_UNIX)
8492 if (ctx->ring_sock) {
8493 struct sock *sock = ctx->ring_sock->sk;
8494 struct sk_buff *skb;
8495
8496 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
8497 kfree_skb(skb);
8498 }
fff4db76 8499#endif
042b0d85 8500 io_free_file_tables(&ctx->file_table);
44b31f2f 8501 io_rsrc_data_free(ctx->file_data);
fff4db76
PB
8502 ctx->file_data = NULL;
8503 ctx->nr_user_files = 0;
1ad555c6
BM
8504}
8505
d7954b2b
BM
8506static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
8507{
d7954b2b
BM
8508 int ret;
8509
08480400 8510 if (!ctx->file_data)
d7954b2b 8511 return -ENXIO;
08480400
PB
8512 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
8513 if (!ret)
8514 __io_sqe_files_unregister(ctx);
8515 return ret;
6b06314c
JA
8516}
8517
37d1e2e3 8518static void io_sq_thread_unpark(struct io_sq_data *sqd)
09a6f4ef 8519 __releases(&sqd->lock)
37d1e2e3 8520{
521d6a73
PB
8521 WARN_ON_ONCE(sqd->thread == current);
8522
9e138a48
PB
8523 /*
8524 * Do the dance but not conditional clear_bit() because it'd race with
8525 * other threads incrementing park_pending and setting the bit.
8526 */
37d1e2e3 8527 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
9e138a48
PB
8528 if (atomic_dec_return(&sqd->park_pending))
8529 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 8530 mutex_unlock(&sqd->lock);
37d1e2e3
JA
8531}
8532
86e0d676 8533static void io_sq_thread_park(struct io_sq_data *sqd)
09a6f4ef 8534 __acquires(&sqd->lock)
37d1e2e3 8535{
521d6a73
PB
8536 WARN_ON_ONCE(sqd->thread == current);
8537
9e138a48 8538 atomic_inc(&sqd->park_pending);
86e0d676 8539 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 8540 mutex_lock(&sqd->lock);
05962f95 8541 if (sqd->thread)
86e0d676 8542 wake_up_process(sqd->thread);
37d1e2e3
JA
8543}
8544
8545static void io_sq_thread_stop(struct io_sq_data *sqd)
8546{
521d6a73 8547 WARN_ON_ONCE(sqd->thread == current);
88885f66 8548 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
521d6a73 8549
05962f95 8550 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
88885f66 8551 mutex_lock(&sqd->lock);
e8f98f24
JA
8552 if (sqd->thread)
8553 wake_up_process(sqd->thread);
09a6f4ef 8554 mutex_unlock(&sqd->lock);
05962f95 8555 wait_for_completion(&sqd->exited);
37d1e2e3
JA
8556}
8557
534ca6d6 8558static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 8559{
534ca6d6 8560 if (refcount_dec_and_test(&sqd->refs)) {
9e138a48
PB
8561 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8562
37d1e2e3
JA
8563 io_sq_thread_stop(sqd);
8564 kfree(sqd);
8565 }
8566}
8567
8568static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8569{
8570 struct io_sq_data *sqd = ctx->sq_data;
8571
8572 if (sqd) {
05962f95 8573 io_sq_thread_park(sqd);
521d6a73 8574 list_del_init(&ctx->sqd_list);
37d1e2e3 8575 io_sqd_update_thread_idle(sqd);
05962f95 8576 io_sq_thread_unpark(sqd);
37d1e2e3
JA
8577
8578 io_put_sq_data(sqd);
8579 ctx->sq_data = NULL;
534ca6d6
JA
8580 }
8581}
8582
aa06165d
JA
8583static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8584{
8585 struct io_ring_ctx *ctx_attach;
8586 struct io_sq_data *sqd;
8587 struct fd f;
8588
8589 f = fdget(p->wq_fd);
8590 if (!f.file)
8591 return ERR_PTR(-ENXIO);
8592 if (f.file->f_op != &io_uring_fops) {
8593 fdput(f);
8594 return ERR_PTR(-EINVAL);
8595 }
8596
8597 ctx_attach = f.file->private_data;
8598 sqd = ctx_attach->sq_data;
8599 if (!sqd) {
8600 fdput(f);
8601 return ERR_PTR(-EINVAL);
8602 }
5c2469e0
JA
8603 if (sqd->task_tgid != current->tgid) {
8604 fdput(f);
8605 return ERR_PTR(-EPERM);
8606 }
aa06165d
JA
8607
8608 refcount_inc(&sqd->refs);
8609 fdput(f);
8610 return sqd;
8611}
8612
26984fbf
PB
8613static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8614 bool *attached)
534ca6d6
JA
8615{
8616 struct io_sq_data *sqd;
8617
26984fbf 8618 *attached = false;
5c2469e0
JA
8619 if (p->flags & IORING_SETUP_ATTACH_WQ) {
8620 sqd = io_attach_sq_data(p);
26984fbf
PB
8621 if (!IS_ERR(sqd)) {
8622 *attached = true;
5c2469e0 8623 return sqd;
26984fbf 8624 }
5c2469e0
JA
8625 /* fall through for EPERM case, setup new sqd/task */
8626 if (PTR_ERR(sqd) != -EPERM)
8627 return sqd;
8628 }
aa06165d 8629
534ca6d6
JA
8630 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8631 if (!sqd)
8632 return ERR_PTR(-ENOMEM);
8633
9e138a48 8634 atomic_set(&sqd->park_pending, 0);
534ca6d6 8635 refcount_set(&sqd->refs, 1);
69fb2131 8636 INIT_LIST_HEAD(&sqd->ctx_list);
09a6f4ef 8637 mutex_init(&sqd->lock);
534ca6d6 8638 init_waitqueue_head(&sqd->wait);
37d1e2e3 8639 init_completion(&sqd->exited);
534ca6d6
JA
8640 return sqd;
8641}
8642
6b06314c
JA
8643/*
8644 * Ensure the UNIX gc is aware of our file set, so we are certain that
8645 * the io_uring can be safely unregistered on process exit, even if we have
1f59bc0f
PB
8646 * loops in the file referencing. We account only files that can hold other
8647 * files because otherwise they can't form a loop and so are not interesting
8648 * for GC.
6b06314c 8649 */
8b3171bd 8650static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
6b06314c 8651{
73b25d3b 8652#if defined(CONFIG_UNIX)
6b06314c 8653 struct sock *sk = ctx->ring_sock->sk;
73b25d3b 8654 struct sk_buff_head *head = &sk->sk_receive_queue;
6b06314c
JA
8655 struct scm_fp_list *fpl;
8656 struct sk_buff *skb;
6b06314c 8657
73b25d3b
PB
8658 if (likely(!io_file_need_scm(file)))
8659 return 0;
8660
8661 /*
8662 * See if we can merge this file into an existing skb SCM_RIGHTS
8663 * file set. If there's no room, fall back to allocating a new skb
8664 * and filling it in.
8665 */
8666 spin_lock_irq(&head->lock);
8667 skb = skb_peek(head);
8668 if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
8669 __skb_unlink(skb, head);
8670 else
8671 skb = NULL;
8672 spin_unlock_irq(&head->lock);
6b06314c 8673
6b06314c 8674 if (!skb) {
73b25d3b
PB
8675 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8676 if (!fpl)
8677 return -ENOMEM;
6b06314c 8678
73b25d3b
PB
8679 skb = alloc_skb(0, GFP_KERNEL);
8680 if (!skb) {
8681 kfree(fpl);
8682 return -ENOMEM;
8683 }
6b06314c 8684
73b25d3b
PB
8685 fpl->user = get_uid(current_user());
8686 fpl->max = SCM_MAX_FD;
8687 fpl->count = 0;
dca58c6a 8688
73b25d3b
PB
8689 UNIXCB(skb).fp = fpl;
8690 skb->sk = sk;
8691 skb->destructor = unix_destruct_scm;
8692 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8693 }
8694
8695 fpl = UNIXCB(skb).fp;
8696 fpl->fp[fpl->count++] = get_file(file);
8697 unix_inflight(fpl->user, file);
8698 skb_queue_head(head, skb);
dca58c6a 8699 fput(file);
73b25d3b 8700#endif
6b06314c
JA
8701 return 0;
8702}
6b06314c 8703
47e90392 8704static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 8705{
50238531 8706 struct file *file = prsrc->file;
05f3fb3c
JA
8707#if defined(CONFIG_UNIX)
8708 struct sock *sock = ctx->ring_sock->sk;
8709 struct sk_buff_head list, *head = &sock->sk_receive_queue;
8710 struct sk_buff *skb;
8711 int i;
8712
1f59bc0f
PB
8713 if (!io_file_need_scm(file)) {
8714 fput(file);
8715 return;
8716 }
8717
05f3fb3c
JA
8718 __skb_queue_head_init(&list);
8719
8720 /*
8721 * Find the skb that holds this file in its SCM_RIGHTS. When found,
8722 * remove this entry and rearrange the file array.
8723 */
8724 skb = skb_dequeue(head);
8725 while (skb) {
8726 struct scm_fp_list *fp;
8727
8728 fp = UNIXCB(skb).fp;
8729 for (i = 0; i < fp->count; i++) {
8730 int left;
8731
8732 if (fp->fp[i] != file)
8733 continue;
8734
8735 unix_notinflight(fp->user, fp->fp[i]);
8736 left = fp->count - 1 - i;
8737 if (left) {
8738 memmove(&fp->fp[i], &fp->fp[i + 1],
8739 left * sizeof(struct file *));
8740 }
8741 fp->count--;
8742 if (!fp->count) {
8743 kfree_skb(skb);
8744 skb = NULL;
8745 } else {
8746 __skb_queue_tail(&list, skb);
8747 }
8748 fput(file);
8749 file = NULL;
8750 break;
8751 }
8752
8753 if (!file)
8754 break;
8755
8756 __skb_queue_tail(&list, skb);
8757
8758 skb = skb_dequeue(head);
8759 }
8760
8761 if (skb_peek(&list)) {
8762 spin_lock_irq(&head->lock);
8763 while ((skb = __skb_dequeue(&list)) != NULL)
8764 __skb_queue_tail(head, skb);
8765 spin_unlock_irq(&head->lock);
8766 }
8767#else
8768 fput(file);
8769#endif
8770}
8771
b895c9a6 8772static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
65e19f54 8773{
b895c9a6 8774 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
269bbe5f
BM
8775 struct io_ring_ctx *ctx = rsrc_data->ctx;
8776 struct io_rsrc_put *prsrc, *tmp;
05589553 8777
269bbe5f
BM
8778 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8779 list_del(&prsrc->list);
b60c8dce
PB
8780
8781 if (prsrc->tag) {
f8929630
PB
8782 if (ctx->flags & IORING_SETUP_IOPOLL)
8783 mutex_lock(&ctx->uring_lock);
b60c8dce 8784
79ebeaee 8785 spin_lock(&ctx->completion_lock);
913a571a 8786 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
b60c8dce 8787 io_commit_cqring(ctx);
79ebeaee 8788 spin_unlock(&ctx->completion_lock);
b60c8dce 8789 io_cqring_ev_posted(ctx);
f8929630
PB
8790
8791 if (ctx->flags & IORING_SETUP_IOPOLL)
8792 mutex_unlock(&ctx->uring_lock);
b60c8dce
PB
8793 }
8794
40ae0ff7 8795 rsrc_data->do_put(ctx, prsrc);
269bbe5f 8796 kfree(prsrc);
65e19f54 8797 }
05589553 8798
28a9fe25 8799 io_rsrc_node_destroy(ref_node);
3e942498
PB
8800 if (atomic_dec_and_test(&rsrc_data->refs))
8801 complete(&rsrc_data->done);
2faf852d 8802}
65e19f54 8803
269bbe5f 8804static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
8805{
8806 struct io_ring_ctx *ctx;
8807 struct llist_node *node;
8808
269bbe5f
BM
8809 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8810 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
8811
8812 while (node) {
b895c9a6 8813 struct io_rsrc_node *ref_node;
4a38aed2
JA
8814 struct llist_node *next = node->next;
8815
b895c9a6 8816 ref_node = llist_entry(node, struct io_rsrc_node, llist);
269bbe5f 8817 __io_rsrc_put_work(ref_node);
4a38aed2
JA
8818 node = next;
8819 }
8820}
8821
6b06314c 8822static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
792e3582 8823 unsigned nr_args, u64 __user *tags)
6b06314c
JA
8824{
8825 __s32 __user *fds = (__s32 __user *) arg;
05f3fb3c 8826 struct file *file;
f3baed39 8827 int fd, ret;
846a4ef2 8828 unsigned i;
6b06314c 8829
05f3fb3c 8830 if (ctx->file_data)
6b06314c
JA
8831 return -EBUSY;
8832 if (!nr_args)
8833 return -EINVAL;
8834 if (nr_args > IORING_MAX_FIXED_FILES)
8835 return -EMFILE;
3a1b8a4e
PB
8836 if (nr_args > rlimit(RLIMIT_NOFILE))
8837 return -EMFILE;
a7f0ed5a 8838 ret = io_rsrc_node_switch_start(ctx);
f3baed39
PB
8839 if (ret)
8840 return ret;
d878c816
PB
8841 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8842 &ctx->file_data);
8843 if (ret)
8844 return ret;
6b06314c 8845
a03a2a20
PB
8846 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
8847 io_rsrc_data_free(ctx->file_data);
8848 ctx->file_data = NULL;
8849 return -ENOMEM;
8850 }
65e19f54 8851
08a45173 8852 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
a03a2a20
PB
8853 struct io_fixed_file *file_slot;
8854
d878c816 8855 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
600cf3f8 8856 ret = -EFAULT;
a03a2a20 8857 goto fail;
600cf3f8 8858 }
08a45173 8859 /* allow sparse sets */
792e3582
PB
8860 if (fd == -1) {
8861 ret = -EINVAL;
2d091d62 8862 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
a03a2a20 8863 goto fail;
08a45173 8864 continue;
792e3582 8865 }
6b06314c 8866
05f3fb3c 8867 file = fget(fd);
6b06314c 8868 ret = -EBADF;
792e3582 8869 if (unlikely(!file))
a03a2a20 8870 goto fail;
05f3fb3c 8871
6b06314c
JA
8872 /*
8873 * Don't allow io_uring instances to be registered. If UNIX
8874 * isn't enabled, then this causes a reference cycle and this
8875 * instance can never get freed. If UNIX is enabled we'll
8876 * handle it just fine, but there's still no point in allowing
8877 * a ring fd as it doesn't support regular read/write anyway.
8878 */
05f3fb3c
JA
8879 if (file->f_op == &io_uring_fops) {
8880 fput(file);
a03a2a20
PB
8881 goto fail;
8882 }
8b3171bd 8883 ret = io_scm_file_account(ctx, file);
a03a2a20 8884 if (ret) {
a03a2a20
PB
8885 fput(file);
8886 goto fail;
6b06314c 8887 }
e390510a
PB
8888 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8889 io_fixed_file_set(file_slot, file);
05589553 8890 }
6b06314c 8891
a7f0ed5a 8892 io_rsrc_node_switch(ctx, NULL);
a03a2a20
PB
8893 return 0;
8894fail:
8895 __io_sqe_files_unregister(ctx);
6b06314c
JA
8896 return ret;
8897}
8898
9c7b0ba8
PB
8899static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8900 struct io_rsrc_node *node, void *rsrc)
8901{
8f0a2480 8902 u64 *tag_slot = io_get_tag_slot(data, idx);
9c7b0ba8
PB
8903 struct io_rsrc_put *prsrc;
8904
8905 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8906 if (!prsrc)
8907 return -ENOMEM;
8908
8f0a2480
PB
8909 prsrc->tag = *tag_slot;
8910 *tag_slot = 0;
9c7b0ba8
PB
8911 prsrc->rsrc = rsrc;
8912 list_add(&prsrc->list, &node->rsrc_list);
8913 return 0;
8914}
8915
b9445598
PB
8916static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8917 unsigned int issue_flags, u32 slot_index)
8918{
8919 struct io_ring_ctx *ctx = req->ctx;
9c7b0ba8 8920 bool needs_switch = false;
b9445598
PB
8921 struct io_fixed_file *file_slot;
8922 int ret = -EBADF;
8923
f8929630 8924 io_ring_submit_lock(ctx, issue_flags);
b9445598
PB
8925 if (file->f_op == &io_uring_fops)
8926 goto err;
8927 ret = -ENXIO;
8928 if (!ctx->file_data)
8929 goto err;
8930 ret = -EINVAL;
8931 if (slot_index >= ctx->nr_user_files)
8932 goto err;
8933
8934 slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8935 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
9c7b0ba8
PB
8936
8937 if (file_slot->file_ptr) {
8938 struct file *old_file;
8939
8940 ret = io_rsrc_node_switch_start(ctx);
8941 if (ret)
8942 goto err;
8943
8944 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8945 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8946 ctx->rsrc_node, old_file);
8947 if (ret)
8948 goto err;
8949 file_slot->file_ptr = 0;
8950 needs_switch = true;
8951 }
b9445598 8952
8b3171bd 8953 ret = io_scm_file_account(ctx, file);
e390510a
PB
8954 if (!ret) {
8955 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8956 io_fixed_file_set(file_slot, file);
b9445598 8957 }
b9445598 8958err:
9c7b0ba8
PB
8959 if (needs_switch)
8960 io_rsrc_node_switch(ctx, ctx->file_data);
f8929630 8961 io_ring_submit_unlock(ctx, issue_flags);
b9445598
PB
8962 if (ret)
8963 fput(file);
8964 return ret;
8965}
8966
7df778be
PB
8967static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8968{
8969 unsigned int offset = req->close.file_slot - 1;
8970 struct io_ring_ctx *ctx = req->ctx;
8971 struct io_fixed_file *file_slot;
8972 struct file *file;
4cdd158b 8973 int ret;
7df778be 8974
f8929630 8975 io_ring_submit_lock(ctx, issue_flags);
7df778be
PB
8976 ret = -ENXIO;
8977 if (unlikely(!ctx->file_data))
8978 goto out;
8979 ret = -EINVAL;
8980 if (offset >= ctx->nr_user_files)
8981 goto out;
8982 ret = io_rsrc_node_switch_start(ctx);
8983 if (ret)
8984 goto out;
8985
4cdd158b
PB
8986 offset = array_index_nospec(offset, ctx->nr_user_files);
8987 file_slot = io_fixed_file_slot(&ctx->file_table, offset);
7df778be
PB
8988 ret = -EBADF;
8989 if (!file_slot->file_ptr)
8990 goto out;
8991
8992 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8993 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8994 if (ret)
8995 goto out;
8996
8997 file_slot->file_ptr = 0;
8998 io_rsrc_node_switch(ctx, ctx->file_data);
8999 ret = 0;
9000out:
f8929630 9001 io_ring_submit_unlock(ctx, issue_flags);
7df778be
PB
9002 return ret;
9003}
9004
05f3fb3c 9005static int __io_sqe_files_update(struct io_ring_ctx *ctx,
c3bdad02 9006 struct io_uring_rsrc_update2 *up,
05f3fb3c
JA
9007 unsigned nr_args)
9008{
c3bdad02 9009 u64 __user *tags = u64_to_user_ptr(up->tags);
98f0b3b4 9010 __s32 __user *fds = u64_to_user_ptr(up->data);
b895c9a6 9011 struct io_rsrc_data *data = ctx->file_data;
a04b0ac0
PB
9012 struct io_fixed_file *file_slot;
9013 struct file *file;
98f0b3b4
PB
9014 int fd, i, err = 0;
9015 unsigned int done;
05589553 9016 bool needs_switch = false;
c3a31e60 9017
98f0b3b4
PB
9018 if (!ctx->file_data)
9019 return -ENXIO;
9020 if (up->offset + nr_args > ctx->nr_user_files)
c3a31e60
JA
9021 return -EINVAL;
9022
67973b93 9023 for (done = 0; done < nr_args; done++) {
c3bdad02
PB
9024 u64 tag = 0;
9025
9026 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
9027 copy_from_user(&fd, &fds[done], sizeof(fd))) {
c3a31e60
JA
9028 err = -EFAULT;
9029 break;
9030 }
c3bdad02
PB
9031 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
9032 err = -EINVAL;
9033 break;
9034 }
4e0377a1 9035 if (fd == IORING_REGISTER_FILES_SKIP)
9036 continue;
9037
67973b93 9038 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
aeca241b 9039 file_slot = io_fixed_file_slot(&ctx->file_table, i);
ea64ec02 9040
a04b0ac0
PB
9041 if (file_slot->file_ptr) {
9042 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
4cdd158b 9043 err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
a5318d3c
HD
9044 if (err)
9045 break;
a04b0ac0 9046 file_slot->file_ptr = 0;
05589553 9047 needs_switch = true;
c3a31e60
JA
9048 }
9049 if (fd != -1) {
c3a31e60
JA
9050 file = fget(fd);
9051 if (!file) {
9052 err = -EBADF;
9053 break;
9054 }
9055 /*
9056 * Don't allow io_uring instances to be registered. If
9057 * UNIX isn't enabled, then this causes a reference
9058 * cycle and this instance can never get freed. If UNIX
9059 * is enabled we'll handle it just fine, but there's
9060 * still no point in allowing a ring fd as it doesn't
9061 * support regular read/write anyway.
9062 */
9063 if (file->f_op == &io_uring_fops) {
9064 fput(file);
9065 err = -EBADF;
9066 break;
9067 }
8b3171bd 9068 err = io_scm_file_account(ctx, file);
f3bd9dae
YY
9069 if (err) {
9070 fput(file);
c3a31e60 9071 break;
f3bd9dae 9072 }
e390510a
PB
9073 *io_get_tag_slot(data, i) = tag;
9074 io_fixed_file_set(file_slot, file);
c3a31e60 9075 }
05f3fb3c
JA
9076 }
9077
a7f0ed5a
PB
9078 if (needs_switch)
9079 io_rsrc_node_switch(ctx, data);
c3a31e60
JA
9080 return done ? done : err;
9081}
05589553 9082
685fe7fe
JA
9083static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
9084 struct task_struct *task)
24369c2e 9085{
e941894e 9086 struct io_wq_hash *hash;
24369c2e 9087 struct io_wq_data data;
24369c2e 9088 unsigned int concurrency;
24369c2e 9089
362a9e65 9090 mutex_lock(&ctx->uring_lock);
e941894e
JA
9091 hash = ctx->hash_map;
9092 if (!hash) {
9093 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
362a9e65
YY
9094 if (!hash) {
9095 mutex_unlock(&ctx->uring_lock);
e941894e 9096 return ERR_PTR(-ENOMEM);
362a9e65 9097 }
e941894e
JA
9098 refcount_set(&hash->refs, 1);
9099 init_waitqueue_head(&hash->wait);
9100 ctx->hash_map = hash;
24369c2e 9101 }
362a9e65 9102 mutex_unlock(&ctx->uring_lock);
24369c2e 9103
e941894e 9104 data.hash = hash;
685fe7fe 9105 data.task = task;
ebc11b6c 9106 data.free_work = io_wq_free_work;
f5fa38c5 9107 data.do_work = io_wq_submit_work;
24369c2e 9108
d25e3a3d
JA
9109 /* Do QD, or 4 * CPUS, whatever is smallest */
9110 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 9111
5aa75ed5 9112 return io_wq_create(concurrency, &data);
24369c2e
PB
9113}
9114
c072481d
PB
9115static __cold int io_uring_alloc_task_context(struct task_struct *task,
9116 struct io_ring_ctx *ctx)
0f212204
JA
9117{
9118 struct io_uring_task *tctx;
d8a6df10 9119 int ret;
0f212204 9120
09899b19 9121 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
0f212204
JA
9122 if (unlikely(!tctx))
9123 return -ENOMEM;
9124
e7a6c00d
JA
9125 tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
9126 sizeof(struct file *), GFP_KERNEL);
9127 if (unlikely(!tctx->registered_rings)) {
9128 kfree(tctx);
9129 return -ENOMEM;
9130 }
9131
d8a6df10
JA
9132 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
9133 if (unlikely(ret)) {
e7a6c00d 9134 kfree(tctx->registered_rings);
d8a6df10
JA
9135 kfree(tctx);
9136 return ret;
9137 }
9138
685fe7fe 9139 tctx->io_wq = io_init_wq_offload(ctx, task);
5aa75ed5
JA
9140 if (IS_ERR(tctx->io_wq)) {
9141 ret = PTR_ERR(tctx->io_wq);
9142 percpu_counter_destroy(&tctx->inflight);
e7a6c00d 9143 kfree(tctx->registered_rings);
5aa75ed5
JA
9144 kfree(tctx);
9145 return ret;
9146 }
9147
0f212204
JA
9148 xa_init(&tctx->xa);
9149 init_waitqueue_head(&tctx->wait);
fdaf083c 9150 atomic_set(&tctx->in_idle, 0);
0f212204 9151 task->io_uring = tctx;
7cbf1722
JA
9152 spin_lock_init(&tctx->task_lock);
9153 INIT_WQ_LIST(&tctx->task_list);
4813c377 9154 INIT_WQ_LIST(&tctx->prior_task_list);
7cbf1722 9155 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
9156 return 0;
9157}
9158
9159void __io_uring_free(struct task_struct *tsk)
9160{
9161 struct io_uring_task *tctx = tsk->io_uring;
9162
9163 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e 9164 WARN_ON_ONCE(tctx->io_wq);
09899b19 9165 WARN_ON_ONCE(tctx->cached_refs);
ef8eaa4e 9166
e7a6c00d 9167 kfree(tctx->registered_rings);
d8a6df10 9168 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
9169 kfree(tctx);
9170 tsk->io_uring = NULL;
9171}
9172
c072481d
PB
9173static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
9174 struct io_uring_params *p)
2b188cc1
JA
9175{
9176 int ret;
9177
d25e3a3d
JA
9178 /* Retain compatibility with failing for an invalid attach attempt */
9179 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
9180 IORING_SETUP_ATTACH_WQ) {
9181 struct fd f;
9182
9183 f = fdget(p->wq_fd);
9184 if (!f.file)
9185 return -ENXIO;
0cc936f7
JA
9186 if (f.file->f_op != &io_uring_fops) {
9187 fdput(f);
f2a48dd0 9188 return -EINVAL;
0cc936f7
JA
9189 }
9190 fdput(f);
d25e3a3d 9191 }
6c271ce2 9192 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 9193 struct task_struct *tsk;
534ca6d6 9194 struct io_sq_data *sqd;
26984fbf 9195 bool attached;
534ca6d6 9196
cdc1404a
PM
9197 ret = security_uring_sqpoll();
9198 if (ret)
9199 return ret;
9200
26984fbf 9201 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
9202 if (IS_ERR(sqd)) {
9203 ret = PTR_ERR(sqd);
9204 goto err;
9205 }
69fb2131 9206
7c30f36a 9207 ctx->sq_creds = get_current_cred();
534ca6d6 9208 ctx->sq_data = sqd;
917257da
JA
9209 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
9210 if (!ctx->sq_thread_idle)
9211 ctx->sq_thread_idle = HZ;
9212
78d7f6ba 9213 io_sq_thread_park(sqd);
de75a3d3
PB
9214 list_add(&ctx->sqd_list, &sqd->ctx_list);
9215 io_sqd_update_thread_idle(sqd);
26984fbf 9216 /* don't attach to a dying SQPOLL thread, would be racy */
f2a48dd0 9217 ret = (attached && !sqd->thread) ? -ENXIO : 0;
78d7f6ba
PB
9218 io_sq_thread_unpark(sqd);
9219
de75a3d3
PB
9220 if (ret < 0)
9221 goto err;
9222 if (attached)
5aa75ed5 9223 return 0;
aa06165d 9224
6c271ce2 9225 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 9226 int cpu = p->sq_thread_cpu;
6c271ce2 9227
917257da 9228 ret = -EINVAL;
f2a48dd0 9229 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
e8f98f24 9230 goto err_sqpoll;
37d1e2e3 9231 sqd->sq_cpu = cpu;
6c271ce2 9232 } else {
37d1e2e3 9233 sqd->sq_cpu = -1;
6c271ce2 9234 }
37d1e2e3
JA
9235
9236 sqd->task_pid = current->pid;
5c2469e0 9237 sqd->task_tgid = current->tgid;
46fe18b1
JA
9238 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
9239 if (IS_ERR(tsk)) {
9240 ret = PTR_ERR(tsk);
e8f98f24 9241 goto err_sqpoll;
6c271ce2 9242 }
97a73a0f 9243
46fe18b1 9244 sqd->thread = tsk;
97a73a0f 9245 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 9246 wake_up_new_task(tsk);
0f212204
JA
9247 if (ret)
9248 goto err;
6c271ce2
JA
9249 } else if (p->flags & IORING_SETUP_SQ_AFF) {
9250 /* Can't have SQ_AFF without SQPOLL */
9251 ret = -EINVAL;
9252 goto err;
9253 }
9254
2b188cc1 9255 return 0;
f2a48dd0
PB
9256err_sqpoll:
9257 complete(&ctx->sq_data->exited);
2b188cc1 9258err:
37d1e2e3 9259 io_sq_thread_finish(ctx);
2b188cc1
JA
9260 return ret;
9261}
9262
a087e2b5
BM
9263static inline void __io_unaccount_mem(struct user_struct *user,
9264 unsigned long nr_pages)
2b188cc1
JA
9265{
9266 atomic_long_sub(nr_pages, &user->locked_vm);
9267}
9268
a087e2b5
BM
9269static inline int __io_account_mem(struct user_struct *user,
9270 unsigned long nr_pages)
2b188cc1
JA
9271{
9272 unsigned long page_limit, cur_pages, new_pages;
9273
9274 /* Don't allow more pages than we can safely lock */
9275 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
9276
9277 do {
9278 cur_pages = atomic_long_read(&user->locked_vm);
9279 new_pages = cur_pages + nr_pages;
9280 if (new_pages > page_limit)
9281 return -ENOMEM;
9282 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
9283 new_pages) != cur_pages);
9284
9285 return 0;
9286}
9287
26bfa89e 9288static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 9289{
62e398be 9290 if (ctx->user)
a087e2b5 9291 __io_unaccount_mem(ctx->user, nr_pages);
30975825 9292
26bfa89e
JA
9293 if (ctx->mm_account)
9294 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
9295}
9296
26bfa89e 9297static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 9298{
30975825
BM
9299 int ret;
9300
62e398be 9301 if (ctx->user) {
30975825
BM
9302 ret = __io_account_mem(ctx->user, nr_pages);
9303 if (ret)
9304 return ret;
9305 }
9306
26bfa89e
JA
9307 if (ctx->mm_account)
9308 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
9309
9310 return 0;
9311}
9312
2b188cc1
JA
9313static void io_mem_free(void *ptr)
9314{
52e04ef4
MR
9315 struct page *page;
9316
9317 if (!ptr)
9318 return;
2b188cc1 9319
52e04ef4 9320 page = virt_to_head_page(ptr);
2b188cc1
JA
9321 if (put_page_testzero(page))
9322 free_compound_page(page);
9323}
9324
9325static void *io_mem_alloc(size_t size)
9326{
0a3f1e0b 9327 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2b188cc1 9328
0a3f1e0b 9329 return (void *) __get_free_pages(gfp, get_order(size));
2b188cc1
JA
9330}
9331
75b28aff
HV
9332static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
9333 size_t *sq_offset)
9334{
9335 struct io_rings *rings;
9336 size_t off, sq_array_size;
9337
9338 off = struct_size(rings, cqes, cq_entries);
9339 if (off == SIZE_MAX)
9340 return SIZE_MAX;
9341
9342#ifdef CONFIG_SMP
9343 off = ALIGN(off, SMP_CACHE_BYTES);
9344 if (off == 0)
9345 return SIZE_MAX;
9346#endif
9347
b36200f5
DV
9348 if (sq_offset)
9349 *sq_offset = off;
9350
75b28aff
HV
9351 sq_array_size = array_size(sizeof(u32), sq_entries);
9352 if (sq_array_size == SIZE_MAX)
9353 return SIZE_MAX;
9354
9355 if (check_add_overflow(off, sq_array_size, &off))
9356 return SIZE_MAX;
9357
75b28aff
HV
9358 return off;
9359}
9360
41edf1a5 9361static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
7f61a1e9 9362{
41edf1a5 9363 struct io_mapped_ubuf *imu = *slot;
7f61a1e9
PB
9364 unsigned int i;
9365
6224843d
PB
9366 if (imu != ctx->dummy_ubuf) {
9367 for (i = 0; i < imu->nr_bvecs; i++)
9368 unpin_user_page(imu->bvec[i].bv_page);
9369 if (imu->acct_pages)
9370 io_unaccount_mem(ctx, imu->acct_pages);
9371 kvfree(imu);
9372 }
41edf1a5 9373 *slot = NULL;
7f61a1e9
PB
9374}
9375
bd54b6fe 9376static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
edafccee 9377{
634d00df
PB
9378 io_buffer_unmap(ctx, &prsrc->buf);
9379 prsrc->buf = NULL;
bd54b6fe 9380}
edafccee 9381
bd54b6fe
BM
9382static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
9383{
9384 unsigned int i;
edafccee 9385
7f61a1e9
PB
9386 for (i = 0; i < ctx->nr_user_bufs; i++)
9387 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
edafccee 9388 kfree(ctx->user_bufs);
bb6659cc 9389 io_rsrc_data_free(ctx->buf_data);
edafccee 9390 ctx->user_bufs = NULL;
bd54b6fe 9391 ctx->buf_data = NULL;
edafccee 9392 ctx->nr_user_bufs = 0;
bd54b6fe
BM
9393}
9394
0a96bbe4 9395static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee 9396{
bd54b6fe 9397 int ret;
edafccee 9398
bd54b6fe 9399 if (!ctx->buf_data)
edafccee
JA
9400 return -ENXIO;
9401
bd54b6fe
BM
9402 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
9403 if (!ret)
9404 __io_sqe_buffers_unregister(ctx);
9405 return ret;
edafccee
JA
9406}
9407
9408static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
9409 void __user *arg, unsigned index)
9410{
9411 struct iovec __user *src;
9412
9413#ifdef CONFIG_COMPAT
9414 if (ctx->compat) {
9415 struct compat_iovec __user *ciovs;
9416 struct compat_iovec ciov;
9417
9418 ciovs = (struct compat_iovec __user *) arg;
9419 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
9420 return -EFAULT;
9421
d55e5f5b 9422 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
9423 dst->iov_len = ciov.iov_len;
9424 return 0;
9425 }
9426#endif
9427 src = (struct iovec __user *) arg;
9428 if (copy_from_user(dst, &src[index], sizeof(*dst)))
9429 return -EFAULT;
9430 return 0;
9431}
9432
de293938
JA
9433/*
9434 * Not super efficient, but this is just a registration time. And we do cache
9435 * the last compound head, so generally we'll only do a full search if we don't
9436 * match that one.
9437 *
9438 * We check if the given compound head page has already been accounted, to
9439 * avoid double accounting it. This allows us to account the full size of the
9440 * page, not just the constituent pages of a huge page.
9441 */
9442static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
9443 int nr_pages, struct page *hpage)
9444{
9445 int i, j;
9446
9447 /* check current page array */
9448 for (i = 0; i < nr_pages; i++) {
9449 if (!PageCompound(pages[i]))
9450 continue;
9451 if (compound_head(pages[i]) == hpage)
9452 return true;
9453 }
9454
9455 /* check previously registered pages */
9456 for (i = 0; i < ctx->nr_user_bufs; i++) {
41edf1a5 9457 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
de293938
JA
9458
9459 for (j = 0; j < imu->nr_bvecs; j++) {
9460 if (!PageCompound(imu->bvec[j].bv_page))
9461 continue;
9462 if (compound_head(imu->bvec[j].bv_page) == hpage)
9463 return true;
9464 }
9465 }
9466
9467 return false;
9468}
9469
9470static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
9471 int nr_pages, struct io_mapped_ubuf *imu,
9472 struct page **last_hpage)
9473{
9474 int i, ret;
9475
216e5835 9476 imu->acct_pages = 0;
de293938
JA
9477 for (i = 0; i < nr_pages; i++) {
9478 if (!PageCompound(pages[i])) {
9479 imu->acct_pages++;
9480 } else {
9481 struct page *hpage;
9482
9483 hpage = compound_head(pages[i]);
9484 if (hpage == *last_hpage)
9485 continue;
9486 *last_hpage = hpage;
9487 if (headpage_already_acct(ctx, pages, i, hpage))
9488 continue;
9489 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9490 }
9491 }
9492
9493 if (!imu->acct_pages)
9494 return 0;
9495
26bfa89e 9496 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
9497 if (ret)
9498 imu->acct_pages = 0;
9499 return ret;
9500}
9501
0a96bbe4 9502static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
41edf1a5 9503 struct io_mapped_ubuf **pimu,
0a96bbe4 9504 struct page **last_hpage)
edafccee 9505{
41edf1a5 9506 struct io_mapped_ubuf *imu = NULL;
edafccee
JA
9507 struct vm_area_struct **vmas = NULL;
9508 struct page **pages = NULL;
0a96bbe4
BM
9509 unsigned long off, start, end, ubuf;
9510 size_t size;
9511 int ret, pret, nr_pages, i;
9512
6224843d
PB
9513 if (!iov->iov_base) {
9514 *pimu = ctx->dummy_ubuf;
9515 return 0;
9516 }
9517
0a96bbe4
BM
9518 ubuf = (unsigned long) iov->iov_base;
9519 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9520 start = ubuf >> PAGE_SHIFT;
9521 nr_pages = end - start;
9522
41edf1a5 9523 *pimu = NULL;
0a96bbe4
BM
9524 ret = -ENOMEM;
9525
9526 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9527 if (!pages)
9528 goto done;
9529
9530 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9531 GFP_KERNEL);
9532 if (!vmas)
9533 goto done;
edafccee 9534
41edf1a5 9535 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
a2b4198c 9536 if (!imu)
0a96bbe4
BM
9537 goto done;
9538
9539 ret = 0;
9540 mmap_read_lock(current->mm);
9541 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9542 pages, vmas);
9543 if (pret == nr_pages) {
9544 /* don't support file backed memory */
9545 for (i = 0; i < nr_pages; i++) {
9546 struct vm_area_struct *vma = vmas[i];
9547
40dad765
PB
9548 if (vma_is_shmem(vma))
9549 continue;
0a96bbe4
BM
9550 if (vma->vm_file &&
9551 !is_file_hugepages(vma->vm_file)) {
9552 ret = -EOPNOTSUPP;
9553 break;
9554 }
9555 }
9556 } else {
9557 ret = pret < 0 ? pret : -EFAULT;
9558 }
9559 mmap_read_unlock(current->mm);
9560 if (ret) {
9561 /*
9562 * if we did partial map, or found file backed vmas,
9563 * release any pages we did get
9564 */
9565 if (pret > 0)
9566 unpin_user_pages(pages, pret);
0a96bbe4
BM
9567 goto done;
9568 }
9569
9570 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9571 if (ret) {
9572 unpin_user_pages(pages, pret);
0a96bbe4
BM
9573 goto done;
9574 }
9575
9576 off = ubuf & ~PAGE_MASK;
9577 size = iov->iov_len;
9578 for (i = 0; i < nr_pages; i++) {
9579 size_t vec_len;
9580
9581 vec_len = min_t(size_t, size, PAGE_SIZE - off);
9582 imu->bvec[i].bv_page = pages[i];
9583 imu->bvec[i].bv_len = vec_len;
9584 imu->bvec[i].bv_offset = off;
9585 off = 0;
9586 size -= vec_len;
9587 }
9588 /* store original address for later verification */
9589 imu->ubuf = ubuf;
4751f53d 9590 imu->ubuf_end = ubuf + iov->iov_len;
0a96bbe4 9591 imu->nr_bvecs = nr_pages;
41edf1a5 9592 *pimu = imu;
0a96bbe4
BM
9593 ret = 0;
9594done:
41edf1a5
PB
9595 if (ret)
9596 kvfree(imu);
0a96bbe4
BM
9597 kvfree(pages);
9598 kvfree(vmas);
9599 return ret;
9600}
9601
2b358604 9602static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 9603{
87094465
PB
9604 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9605 return ctx->user_bufs ? 0 : -ENOMEM;
2b358604 9606}
edafccee 9607
2b358604
BM
9608static int io_buffer_validate(struct iovec *iov)
9609{
50e96989
PB
9610 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9611
2b358604
BM
9612 /*
9613 * Don't impose further limits on the size and buffer
9614 * constraints here, we'll -EINVAL later when IO is
9615 * submitted if they are wrong.
9616 */
6224843d
PB
9617 if (!iov->iov_base)
9618 return iov->iov_len ? -EFAULT : 0;
9619 if (!iov->iov_len)
2b358604 9620 return -EFAULT;
edafccee 9621
2b358604
BM
9622 /* arbitrary limit, but we need something */
9623 if (iov->iov_len > SZ_1G)
9624 return -EFAULT;
edafccee 9625
50e96989
PB
9626 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9627 return -EOVERFLOW;
9628
2b358604
BM
9629 return 0;
9630}
edafccee 9631
2b358604 9632static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
634d00df 9633 unsigned int nr_args, u64 __user *tags)
2b358604 9634{
bd54b6fe
BM
9635 struct page *last_hpage = NULL;
9636 struct io_rsrc_data *data;
2b358604
BM
9637 int i, ret;
9638 struct iovec iov;
edafccee 9639
87094465
PB
9640 if (ctx->user_bufs)
9641 return -EBUSY;
489809e2 9642 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
87094465 9643 return -EINVAL;
bd54b6fe 9644 ret = io_rsrc_node_switch_start(ctx);
2b358604
BM
9645 if (ret)
9646 return ret;
d878c816
PB
9647 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9648 if (ret)
9649 return ret;
bd54b6fe
BM
9650 ret = io_buffers_map_alloc(ctx, nr_args);
9651 if (ret) {
bb6659cc 9652 io_rsrc_data_free(data);
bd54b6fe
BM
9653 return ret;
9654 }
edafccee 9655
87094465 9656 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
edafccee
JA
9657 ret = io_copy_iov(ctx, &iov, arg, i);
9658 if (ret)
0a96bbe4 9659 break;
2b358604
BM
9660 ret = io_buffer_validate(&iov);
9661 if (ret)
0a96bbe4 9662 break;
2d091d62 9663 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
cf3770e7
CIK
9664 ret = -EINVAL;
9665 break;
9666 }
edafccee 9667
41edf1a5
PB
9668 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9669 &last_hpage);
0a96bbe4
BM
9670 if (ret)
9671 break;
edafccee 9672 }
0a96bbe4 9673
bd54b6fe 9674 WARN_ON_ONCE(ctx->buf_data);
0a96bbe4 9675
bd54b6fe
BM
9676 ctx->buf_data = data;
9677 if (ret)
9678 __io_sqe_buffers_unregister(ctx);
9679 else
9680 io_rsrc_node_switch(ctx, NULL);
edafccee
JA
9681 return ret;
9682}
9683
634d00df
PB
9684static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9685 struct io_uring_rsrc_update2 *up,
9686 unsigned int nr_args)
9687{
9688 u64 __user *tags = u64_to_user_ptr(up->tags);
9689 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
634d00df
PB
9690 struct page *last_hpage = NULL;
9691 bool needs_switch = false;
9692 __u32 done;
9693 int i, err;
9694
9695 if (!ctx->buf_data)
9696 return -ENXIO;
9697 if (up->offset + nr_args > ctx->nr_user_bufs)
9698 return -EINVAL;
9699
9700 for (done = 0; done < nr_args; done++) {
0b8c0e7c
PB
9701 struct io_mapped_ubuf *imu;
9702 int offset = up->offset + done;
634d00df
PB
9703 u64 tag = 0;
9704
9705 err = io_copy_iov(ctx, &iov, iovs, done);
9706 if (err)
9707 break;
9708 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9709 err = -EFAULT;
9710 break;
9711 }
0b8c0e7c
PB
9712 err = io_buffer_validate(&iov);
9713 if (err)
9714 break;
cf3770e7
CIK
9715 if (!iov.iov_base && tag) {
9716 err = -EINVAL;
9717 break;
9718 }
0b8c0e7c
PB
9719 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9720 if (err)
9721 break;
634d00df 9722
0b8c0e7c 9723 i = array_index_nospec(offset, ctx->nr_user_bufs);
6224843d 9724 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
4cdd158b 9725 err = io_queue_rsrc_removal(ctx->buf_data, i,
0b8c0e7c
PB
9726 ctx->rsrc_node, ctx->user_bufs[i]);
9727 if (unlikely(err)) {
9728 io_buffer_unmap(ctx, &imu);
634d00df 9729 break;
0b8c0e7c 9730 }
634d00df
PB
9731 ctx->user_bufs[i] = NULL;
9732 needs_switch = true;
9733 }
9734
0b8c0e7c 9735 ctx->user_bufs[i] = imu;
2d091d62 9736 *io_get_tag_slot(ctx->buf_data, offset) = tag;
634d00df
PB
9737 }
9738
9739 if (needs_switch)
9740 io_rsrc_node_switch(ctx, ctx->buf_data);
9741 return done ? done : err;
9742}
9743
c75312dd
UA
9744static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
9745 unsigned int eventfd_async)
9b402849 9746{
77bc59b4 9747 struct io_ev_fd *ev_fd;
9b402849 9748 __s32 __user *fds = arg;
f0a4e62b 9749 int fd;
9b402849 9750
77bc59b4
UA
9751 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
9752 lockdep_is_held(&ctx->uring_lock));
9753 if (ev_fd)
9b402849
JA
9754 return -EBUSY;
9755
9756 if (copy_from_user(&fd, fds, sizeof(*fds)))
9757 return -EFAULT;
9758
77bc59b4
UA
9759 ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
9760 if (!ev_fd)
9761 return -ENOMEM;
fe7e3257 9762
77bc59b4
UA
9763 ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
9764 if (IS_ERR(ev_fd->cq_ev_fd)) {
f0a4e62b 9765 int ret = PTR_ERR(ev_fd->cq_ev_fd);
77bc59b4 9766 kfree(ev_fd);
9b402849
JA
9767 return ret;
9768 }
c75312dd 9769 ev_fd->eventfd_async = eventfd_async;
9aa8dfde 9770 ctx->has_evfd = true;
77bc59b4 9771 rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
f0a4e62b 9772 return 0;
77bc59b4
UA
9773}
9774
9775static void io_eventfd_put(struct rcu_head *rcu)
9776{
9777 struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
9778
9779 eventfd_ctx_put(ev_fd->cq_ev_fd);
9780 kfree(ev_fd);
9b402849
JA
9781}
9782
9783static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9784{
77bc59b4
UA
9785 struct io_ev_fd *ev_fd;
9786
9787 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
9788 lockdep_is_held(&ctx->uring_lock));
9789 if (ev_fd) {
9aa8dfde 9790 ctx->has_evfd = false;
77bc59b4
UA
9791 rcu_assign_pointer(ctx->io_ev_fd, NULL);
9792 call_rcu(&ev_fd->rcu, io_eventfd_put);
9b402849
JA
9793 return 0;
9794 }
9795
9796 return -ENXIO;
9797}
9798
5a2e745d
JA
9799static void io_destroy_buffers(struct io_ring_ctx *ctx)
9800{
dbc7d452
JA
9801 int i;
9802
9803 for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
9804 struct list_head *list = &ctx->io_buffers[i];
9e15c3a0 9805
dbc7d452
JA
9806 while (!list_empty(list)) {
9807 struct io_buffer_list *bl;
9808
9809 bl = list_first_entry(list, struct io_buffer_list, list);
9810 __io_remove_buffers(ctx, bl, -1U);
9811 list_del(&bl->list);
9812 kfree(bl);
9813 }
9814 }
cc3cec83
JA
9815
9816 while (!list_empty(&ctx->io_buffers_pages)) {
9817 struct page *page;
9818
9819 page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
9820 list_del_init(&page->lru);
9821 __free_page(page);
9822 }
5a2e745d
JA
9823}
9824
4010fec4 9825static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 9826{
cd0ca2e0 9827 struct io_submit_state *state = &ctx->submit_state;
37f0e767 9828 int nr = 0;
bf019da7 9829
9a4fdbd8 9830 mutex_lock(&ctx->uring_lock);
cd0ca2e0 9831 io_flush_cached_locked_reqs(ctx, state);
9a4fdbd8 9832
88ab95be 9833 while (!io_req_cache_empty(ctx)) {
c2b6c6bc
PB
9834 struct io_wq_work_node *node;
9835 struct io_kiocb *req;
9a4fdbd8 9836
c2b6c6bc
PB
9837 node = wq_stack_extract(&state->free_list);
9838 req = container_of(node, struct io_kiocb, comp_list);
9839 kmem_cache_free(req_cachep, req);
37f0e767 9840 nr++;
c2b6c6bc 9841 }
37f0e767
PB
9842 if (nr)
9843 percpu_ref_put_many(&ctx->refs, nr);
9a4fdbd8
JA
9844 mutex_unlock(&ctx->uring_lock);
9845}
9846
43597aac 9847static void io_wait_rsrc_data(struct io_rsrc_data *data)
2b188cc1 9848{
43597aac 9849 if (data && !atomic_dec_and_test(&data->refs))
bd54b6fe 9850 wait_for_completion(&data->done);
bd54b6fe 9851}
04fc6c80 9852
4d9237e3
JA
9853static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
9854{
9855 struct async_poll *apoll;
9856
9857 while (!list_empty(&ctx->apoll_cache)) {
9858 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
9859 poll.wait.entry);
9860 list_del(&apoll->poll.wait.entry);
9861 kfree(apoll);
9862 }
9863}
9864
c072481d 9865static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2b188cc1 9866{
37d1e2e3 9867 io_sq_thread_finish(ctx);
2aede0e4 9868
37d1e2e3 9869 if (ctx->mm_account) {
2aede0e4
JA
9870 mmdrop(ctx->mm_account);
9871 ctx->mm_account = NULL;
30975825 9872 }
def596e9 9873
ab409402 9874 io_rsrc_refs_drop(ctx);
43597aac
PB
9875 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9876 io_wait_rsrc_data(ctx->buf_data);
9877 io_wait_rsrc_data(ctx->file_data);
9878
8bad28d8 9879 mutex_lock(&ctx->uring_lock);
43597aac 9880 if (ctx->buf_data)
bd54b6fe 9881 __io_sqe_buffers_unregister(ctx);
43597aac 9882 if (ctx->file_data)
08480400 9883 __io_sqe_files_unregister(ctx);
c4ea060e
PB
9884 if (ctx->rings)
9885 __io_cqring_overflow_flush(ctx, true);
9b402849 9886 io_eventfd_unregister(ctx);
4d9237e3 9887 io_flush_apoll_cache(ctx);
77bc59b4 9888 mutex_unlock(&ctx->uring_lock);
5a2e745d 9889 io_destroy_buffers(ctx);
07db298a
PB
9890 if (ctx->sq_creds)
9891 put_cred(ctx->sq_creds);
def596e9 9892
a7f0ed5a
PB
9893 /* there are no registered resources left, nobody uses it */
9894 if (ctx->rsrc_node)
9895 io_rsrc_node_destroy(ctx->rsrc_node);
8dd03afe 9896 if (ctx->rsrc_backup_node)
b895c9a6 9897 io_rsrc_node_destroy(ctx->rsrc_backup_node);
a7f0ed5a 9898 flush_delayed_work(&ctx->rsrc_put_work);
756ab7c0 9899 flush_delayed_work(&ctx->fallback_work);
a7f0ed5a
PB
9900
9901 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9902 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
def596e9 9903
2b188cc1 9904#if defined(CONFIG_UNIX)
355e8d26
EB
9905 if (ctx->ring_sock) {
9906 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 9907 sock_release(ctx->ring_sock);
355e8d26 9908 }
2b188cc1 9909#endif
ef9dd637 9910 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2b188cc1 9911
75b28aff 9912 io_mem_free(ctx->rings);
2b188cc1 9913 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
9914
9915 percpu_ref_exit(&ctx->refs);
2b188cc1 9916 free_uid(ctx->user);
4010fec4 9917 io_req_caches_free(ctx);
e941894e
JA
9918 if (ctx->hash_map)
9919 io_wq_put_hash(ctx->hash_map);
78076bb6 9920 kfree(ctx->cancel_hash);
6224843d 9921 kfree(ctx->dummy_ubuf);
dbc7d452 9922 kfree(ctx->io_buffers);
2b188cc1
JA
9923 kfree(ctx);
9924}
9925
9926static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9927{
9928 struct io_ring_ctx *ctx = file->private_data;
9929 __poll_t mask = 0;
9930
d60aa65b 9931 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
9932 /*
9933 * synchronizes with barrier from wq_has_sleeper call in
9934 * io_commit_cqring
9935 */
2b188cc1 9936 smp_rmb();
90554200 9937 if (!io_sqring_full(ctx))
2b188cc1 9938 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
9939
9940 /*
9941 * Don't flush cqring overflow list here, just do a simple check.
9942 * Otherwise there could possible be ABBA deadlock:
9943 * CPU0 CPU1
9944 * ---- ----
9945 * lock(&ctx->uring_lock);
9946 * lock(&ep->mtx);
9947 * lock(&ctx->uring_lock);
9948 * lock(&ep->mtx);
9949 *
9950 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
9951 * pushs them to do the flush.
9952 */
5ed7a37d 9953 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
2b188cc1
JA
9954 mask |= EPOLLIN | EPOLLRDNORM;
9955
9956 return mask;
9957}
9958
0bead8cd 9959static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 9960{
4379bf8b 9961 const struct cred *creds;
071698e1 9962
61cf9370 9963 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
9964 if (creds) {
9965 put_cred(creds);
0bead8cd 9966 return 0;
1e6fa521 9967 }
0bead8cd
YD
9968
9969 return -EINVAL;
9970}
9971
d56d938b
PB
9972struct io_tctx_exit {
9973 struct callback_head task_work;
9974 struct completion completion;
baf186c4 9975 struct io_ring_ctx *ctx;
d56d938b
PB
9976};
9977
c072481d 9978static __cold void io_tctx_exit_cb(struct callback_head *cb)
d56d938b
PB
9979{
9980 struct io_uring_task *tctx = current->io_uring;
9981 struct io_tctx_exit *work;
9982
9983 work = container_of(cb, struct io_tctx_exit, task_work);
9984 /*
9985 * When @in_idle, we're in cancellation and it's racy to remove the
9986 * node. It'll be removed by the end of cancellation, just ignore it.
9987 */
9988 if (!atomic_read(&tctx->in_idle))
eef51daa 9989 io_uring_del_tctx_node((unsigned long)work->ctx);
d56d938b
PB
9990 complete(&work->completion);
9991}
9992
c072481d 9993static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
28090c13
PB
9994{
9995 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9996
9997 return req->ctx == data;
9998}
9999
c072481d 10000static __cold void io_ring_exit_work(struct work_struct *work)
85faa7b8 10001{
d56d938b 10002 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 10003 unsigned long timeout = jiffies + HZ * 60 * 5;
58d3be2c 10004 unsigned long interval = HZ / 20;
d56d938b
PB
10005 struct io_tctx_exit exit;
10006 struct io_tctx_node *node;
10007 int ret;
85faa7b8 10008
56952e91
JA
10009 /*
10010 * If we're doing polled IO and end up having requests being
10011 * submitted async (out-of-line), then completions can come in while
10012 * we're waiting for refs to drop. We need to reap these manually,
10013 * as nobody else will be looking for them.
10014 */
b2edc0a7 10015 do {
3dd0c97a 10016 io_uring_try_cancel_requests(ctx, NULL, true);
28090c13
PB
10017 if (ctx->sq_data) {
10018 struct io_sq_data *sqd = ctx->sq_data;
10019 struct task_struct *tsk;
10020
10021 io_sq_thread_park(sqd);
10022 tsk = sqd->thread;
10023 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
10024 io_wq_cancel_cb(tsk->io_uring->io_wq,
10025 io_cancel_ctx_cb, ctx, true);
10026 io_sq_thread_unpark(sqd);
10027 }
b5bb3a24 10028
37f0e767
PB
10029 io_req_caches_free(ctx);
10030
58d3be2c
PB
10031 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
10032 /* there is little hope left, don't run it too often */
10033 interval = HZ * 60;
10034 }
10035 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
d56d938b 10036
7f00651a
PB
10037 init_completion(&exit.completion);
10038 init_task_work(&exit.task_work, io_tctx_exit_cb);
10039 exit.ctx = ctx;
89b5066e
PB
10040 /*
10041 * Some may use context even when all refs and requests have been put,
10042 * and they are free to do so while still holding uring_lock or
5b0a6acc 10043 * completion_lock, see io_req_task_submit(). Apart from other work,
89b5066e
PB
10044 * this lock/unlock section also waits them to finish.
10045 */
d56d938b
PB
10046 mutex_lock(&ctx->uring_lock);
10047 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
10048 WARN_ON_ONCE(time_after(jiffies, timeout));
10049
d56d938b
PB
10050 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
10051 ctx_node);
7f00651a
PB
10052 /* don't spin on a single task if cancellation failed */
10053 list_rotate_left(&ctx->tctx_list);
d56d938b
PB
10054 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
10055 if (WARN_ON_ONCE(ret))
10056 continue;
d56d938b
PB
10057
10058 mutex_unlock(&ctx->uring_lock);
10059 wait_for_completion(&exit.completion);
d56d938b
PB
10060 mutex_lock(&ctx->uring_lock);
10061 }
10062 mutex_unlock(&ctx->uring_lock);
79ebeaee
JA
10063 spin_lock(&ctx->completion_lock);
10064 spin_unlock(&ctx->completion_lock);
d56d938b 10065
85faa7b8
JA
10066 io_ring_ctx_free(ctx);
10067}
10068
80c4cbdb 10069/* Returns true if we found and killed one or more timeouts */
c072481d
PB
10070static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
10071 struct task_struct *tsk, bool cancel_all)
80c4cbdb
PB
10072{
10073 struct io_kiocb *req, *tmp;
10074 int canceled = 0;
10075
79ebeaee
JA
10076 spin_lock(&ctx->completion_lock);
10077 spin_lock_irq(&ctx->timeout_lock);
80c4cbdb 10078 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
3dd0c97a 10079 if (io_match_task(req, tsk, cancel_all)) {
80c4cbdb
PB
10080 io_kill_timeout(req, -ECANCELED);
10081 canceled++;
10082 }
10083 }
79ebeaee 10084 spin_unlock_irq(&ctx->timeout_lock);
60053be8 10085 io_commit_cqring(ctx);
79ebeaee 10086 spin_unlock(&ctx->completion_lock);
80c4cbdb
PB
10087 if (canceled != 0)
10088 io_cqring_ev_posted(ctx);
10089 return canceled != 0;
10090}
10091
c072481d 10092static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2b188cc1 10093{
61cf9370
MWO
10094 unsigned long index;
10095 struct creds *creds;
10096
2b188cc1
JA
10097 mutex_lock(&ctx->uring_lock);
10098 percpu_ref_kill(&ctx->refs);
634578f8 10099 if (ctx->rings)
6c2450ae 10100 __io_cqring_overflow_flush(ctx, true);
61cf9370
MWO
10101 xa_for_each(&ctx->personalities, index, creds)
10102 io_unregister_personality(ctx, index);
2b188cc1
JA
10103 mutex_unlock(&ctx->uring_lock);
10104
60053be8
PB
10105 /* failed during ring init, it couldn't have issued any requests */
10106 if (ctx->rings) {
10107 io_kill_timeouts(ctx, NULL, true);
10108 io_poll_remove_all(ctx, NULL, true);
10109 /* if we failed setting up the ctx, we might not have any rings */
10110 io_iopoll_try_reap_events(ctx);
10111 }
309fc03a 10112
85faa7b8 10113 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
10114 /*
10115 * Use system_unbound_wq to avoid spawning tons of event kworkers
10116 * if we're exiting a ton of rings at the same time. It just adds
10117 * noise and overhead, there's no discernable change in runtime
10118 * over using system_wq.
10119 */
10120 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
10121}
10122
10123static int io_uring_release(struct inode *inode, struct file *file)
10124{
10125 struct io_ring_ctx *ctx = file->private_data;
10126
10127 file->private_data = NULL;
10128 io_ring_ctx_wait_and_kill(ctx);
10129 return 0;
10130}
10131
f6edbabb
PB
10132struct io_task_cancel {
10133 struct task_struct *task;
3dd0c97a 10134 bool all;
f6edbabb 10135};
f254ac04 10136
f6edbabb 10137static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 10138{
9a472ef7 10139 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 10140 struct io_task_cancel *cancel = data;
9a472ef7 10141
6af3f48b 10142 return io_match_task_safe(req, cancel->task, cancel->all);
b711d4ea
JA
10143}
10144
c072481d
PB
10145static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
10146 struct task_struct *task,
10147 bool cancel_all)
b7ddce3c 10148{
e1915f76 10149 struct io_defer_entry *de;
b7ddce3c
PB
10150 LIST_HEAD(list);
10151
79ebeaee 10152 spin_lock(&ctx->completion_lock);
b7ddce3c 10153 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
6af3f48b 10154 if (io_match_task_safe(de->req, task, cancel_all)) {
b7ddce3c
PB
10155 list_cut_position(&list, &ctx->defer_list, &de->list);
10156 break;
10157 }
10158 }
79ebeaee 10159 spin_unlock(&ctx->completion_lock);
e1915f76
PB
10160 if (list_empty(&list))
10161 return false;
b7ddce3c
PB
10162
10163 while (!list_empty(&list)) {
10164 de = list_first_entry(&list, struct io_defer_entry, list);
10165 list_del_init(&de->list);
f41db273 10166 io_req_complete_failed(de->req, -ECANCELED);
b7ddce3c
PB
10167 kfree(de);
10168 }
e1915f76 10169 return true;
b7ddce3c
PB
10170}
10171
c072481d 10172static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
1b00764f
PB
10173{
10174 struct io_tctx_node *node;
10175 enum io_wq_cancel cret;
10176 bool ret = false;
10177
10178 mutex_lock(&ctx->uring_lock);
10179 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10180 struct io_uring_task *tctx = node->task->io_uring;
10181
10182 /*
10183 * io_wq will stay alive while we hold uring_lock, because it's
10184 * killed after ctx nodes, which requires to take the lock.
10185 */
10186 if (!tctx || !tctx->io_wq)
10187 continue;
10188 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
10189 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10190 }
10191 mutex_unlock(&ctx->uring_lock);
10192
10193 return ret;
10194}
10195
c072481d
PB
10196static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
10197 struct task_struct *task,
10198 bool cancel_all)
9936c7c2 10199{
3dd0c97a 10200 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
1b00764f 10201 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2 10202
60053be8
PB
10203 /* failed during ring init, it couldn't have issued any requests */
10204 if (!ctx->rings)
10205 return;
10206
9936c7c2
PB
10207 while (1) {
10208 enum io_wq_cancel cret;
10209 bool ret = false;
10210
1b00764f
PB
10211 if (!task) {
10212 ret |= io_uring_try_cancel_iowq(ctx);
10213 } else if (tctx && tctx->io_wq) {
10214 /*
10215 * Cancels requests of all rings, not only @ctx, but
10216 * it's fine as the task is in exit/exec.
10217 */
5aa75ed5 10218 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
10219 &cancel, true);
10220 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10221 }
10222
10223 /* SQPOLL thread does its own polling */
3dd0c97a 10224 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
d052d1d6 10225 (ctx->sq_data && ctx->sq_data->thread == current)) {
5eef4e87 10226 while (!wq_list_empty(&ctx->iopoll_list)) {
9936c7c2
PB
10227 io_iopoll_try_reap_events(ctx);
10228 ret = true;
10229 }
10230 }
10231
3dd0c97a
PB
10232 ret |= io_cancel_defer_files(ctx, task, cancel_all);
10233 ret |= io_poll_remove_all(ctx, task, cancel_all);
10234 ret |= io_kill_timeouts(ctx, task, cancel_all);
e5dc480d
PB
10235 if (task)
10236 ret |= io_run_task_work();
9936c7c2
PB
10237 if (!ret)
10238 break;
10239 cond_resched();
10240 }
10241}
10242
eef51daa 10243static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
0f212204 10244{
236434c3 10245 struct io_uring_task *tctx = current->io_uring;
13bf43f5 10246 struct io_tctx_node *node;
a528b04e 10247 int ret;
236434c3
MWO
10248
10249 if (unlikely(!tctx)) {
5aa75ed5 10250 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
10251 if (unlikely(ret))
10252 return ret;
e139a1ec 10253
236434c3 10254 tctx = current->io_uring;
e139a1ec
PB
10255 if (ctx->iowq_limits_set) {
10256 unsigned int limits[2] = { ctx->iowq_limits[0],
10257 ctx->iowq_limits[1], };
10258
10259 ret = io_wq_max_workers(tctx->io_wq, limits);
10260 if (ret)
10261 return ret;
10262 }
0f212204 10263 }
cf27f3b1
PB
10264 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
10265 node = kmalloc(sizeof(*node), GFP_KERNEL);
10266 if (!node)
10267 return -ENOMEM;
10268 node->ctx = ctx;
10269 node->task = current;
13bf43f5 10270
cf27f3b1
PB
10271 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
10272 node, GFP_KERNEL));
10273 if (ret) {
10274 kfree(node);
10275 return ret;
0f212204 10276 }
cf27f3b1
PB
10277
10278 mutex_lock(&ctx->uring_lock);
10279 list_add(&node->ctx_node, &ctx->tctx_list);
10280 mutex_unlock(&ctx->uring_lock);
0f212204 10281 }
cf27f3b1 10282 tctx->last = ctx;
0f212204
JA
10283 return 0;
10284}
10285
cf27f3b1
PB
10286/*
10287 * Note that this task has used io_uring. We use it for cancelation purposes.
10288 */
eef51daa 10289static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
cf27f3b1
PB
10290{
10291 struct io_uring_task *tctx = current->io_uring;
10292
10293 if (likely(tctx && tctx->last == ctx))
10294 return 0;
eef51daa 10295 return __io_uring_add_tctx_node(ctx);
cf27f3b1
PB
10296}
10297
0f212204
JA
10298/*
10299 * Remove this io_uring_file -> task mapping.
10300 */
c072481d 10301static __cold void io_uring_del_tctx_node(unsigned long index)
0f212204
JA
10302{
10303 struct io_uring_task *tctx = current->io_uring;
13bf43f5 10304 struct io_tctx_node *node;
2941267b 10305
eebd2e37
PB
10306 if (!tctx)
10307 return;
13bf43f5
PB
10308 node = xa_erase(&tctx->xa, index);
10309 if (!node)
2941267b 10310 return;
0f212204 10311
13bf43f5
PB
10312 WARN_ON_ONCE(current != node->task);
10313 WARN_ON_ONCE(list_empty(&node->ctx_node));
10314
10315 mutex_lock(&node->ctx->uring_lock);
10316 list_del(&node->ctx_node);
10317 mutex_unlock(&node->ctx->uring_lock);
10318
baf186c4 10319 if (tctx->last == node->ctx)
0f212204 10320 tctx->last = NULL;
13bf43f5 10321 kfree(node);
0f212204
JA
10322}
10323
c072481d 10324static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 10325{
ba5ef6dc 10326 struct io_wq *wq = tctx->io_wq;
13bf43f5 10327 struct io_tctx_node *node;
de7f1d9e
PB
10328 unsigned long index;
10329
8bab4c09 10330 xa_for_each(&tctx->xa, index, node) {
eef51daa 10331 io_uring_del_tctx_node(index);
8bab4c09
JA
10332 cond_resched();
10333 }
b16ef427
ME
10334 if (wq) {
10335 /*
f6f9b278 10336 * Must be after io_uring_del_tctx_node() (removes nodes under
b16ef427
ME
10337 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
10338 */
ba5ef6dc 10339 io_wq_put_and_exit(wq);
dadebc35 10340 tctx->io_wq = NULL;
b16ef427 10341 }
de7f1d9e
PB
10342}
10343
3f48cf18 10344static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
521d6a73 10345{
3f48cf18 10346 if (tracked)
d5361233 10347 return 0;
521d6a73
PB
10348 return percpu_counter_sum(&tctx->inflight);
10349}
10350
78cc687b
PB
10351/*
10352 * Find any io_uring ctx that this task has registered or done IO on, and cancel
78a78060 10353 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
78cc687b 10354 */
c072481d
PB
10355static __cold void io_uring_cancel_generic(bool cancel_all,
10356 struct io_sq_data *sqd)
0e9ddb39 10357{
521d6a73 10358 struct io_uring_task *tctx = current->io_uring;
734551df 10359 struct io_ring_ctx *ctx;
0e9ddb39
PB
10360 s64 inflight;
10361 DEFINE_WAIT(wait);
fdaf083c 10362
78cc687b
PB
10363 WARN_ON_ONCE(sqd && sqd->thread != current);
10364
6d042ffb
PO
10365 if (!current->io_uring)
10366 return;
17a91051
PB
10367 if (tctx->io_wq)
10368 io_wq_exit_start(tctx->io_wq);
10369
0e9ddb39
PB
10370 atomic_inc(&tctx->in_idle);
10371 do {
e9dbe221 10372 io_uring_drop_tctx_refs(current);
0e9ddb39 10373 /* read completions before cancelations */
78cc687b 10374 inflight = tctx_inflight(tctx, !cancel_all);
0e9ddb39
PB
10375 if (!inflight)
10376 break;
fdaf083c 10377
78cc687b
PB
10378 if (!sqd) {
10379 struct io_tctx_node *node;
10380 unsigned long index;
0f212204 10381
78cc687b
PB
10382 xa_for_each(&tctx->xa, index, node) {
10383 /* sqpoll task will cancel all its requests */
10384 if (node->ctx->sq_data)
10385 continue;
10386 io_uring_try_cancel_requests(node->ctx, current,
10387 cancel_all);
10388 }
10389 } else {
10390 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
10391 io_uring_try_cancel_requests(ctx, current,
10392 cancel_all);
10393 }
17a91051 10394
78a78060
JA
10395 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
10396 io_run_task_work();
e9dbe221 10397 io_uring_drop_tctx_refs(current);
78a78060 10398
0f212204 10399 /*
a1bb3cd5
PB
10400 * If we've seen completions, retry without waiting. This
10401 * avoids a race where a completion comes in before we did
10402 * prepare_to_wait().
0f212204 10403 */
3dd0c97a 10404 if (inflight == tctx_inflight(tctx, !cancel_all))
a1bb3cd5 10405 schedule();
f57555ed 10406 finish_wait(&tctx->wait, &wait);
d8a6df10 10407 } while (1);
de7f1d9e 10408
8452d4a6 10409 io_uring_clean_tctx(tctx);
3dd0c97a 10410 if (cancel_all) {
3cc7fdb9
PB
10411 /*
10412 * We shouldn't run task_works after cancel, so just leave
10413 * ->in_idle set for normal exit.
10414 */
10415 atomic_dec(&tctx->in_idle);
3f48cf18
PB
10416 /* for exec all current's requests should be gone, kill tctx */
10417 __io_uring_free(current);
10418 }
44e728b8
PB
10419}
10420
f552a27a 10421void __io_uring_cancel(bool cancel_all)
78cc687b 10422{
f552a27a 10423 io_uring_cancel_generic(cancel_all, NULL);
78cc687b
PB
10424}
10425
e7a6c00d
JA
10426void io_uring_unreg_ringfd(void)
10427{
10428 struct io_uring_task *tctx = current->io_uring;
10429 int i;
10430
10431 for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
10432 if (tctx->registered_rings[i]) {
10433 fput(tctx->registered_rings[i]);
10434 tctx->registered_rings[i] = NULL;
10435 }
10436 }
10437}
10438
10439static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
10440 int start, int end)
10441{
10442 struct file *file;
10443 int offset;
10444
10445 for (offset = start; offset < end; offset++) {
10446 offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
10447 if (tctx->registered_rings[offset])
10448 continue;
10449
10450 file = fget(fd);
10451 if (!file) {
10452 return -EBADF;
10453 } else if (file->f_op != &io_uring_fops) {
10454 fput(file);
10455 return -EOPNOTSUPP;
10456 }
10457 tctx->registered_rings[offset] = file;
10458 return offset;
10459 }
10460
10461 return -EBUSY;
10462}
10463
10464/*
10465 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
10466 * invocation. User passes in an array of struct io_uring_rsrc_update
10467 * with ->data set to the ring_fd, and ->offset given for the desired
10468 * index. If no index is desired, application may set ->offset == -1U
10469 * and we'll find an available index. Returns number of entries
10470 * successfully processed, or < 0 on error if none were processed.
10471 */
10472static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
10473 unsigned nr_args)
10474{
10475 struct io_uring_rsrc_update __user *arg = __arg;
10476 struct io_uring_rsrc_update reg;
10477 struct io_uring_task *tctx;
10478 int ret, i;
10479
10480 if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10481 return -EINVAL;
10482
10483 mutex_unlock(&ctx->uring_lock);
10484 ret = io_uring_add_tctx_node(ctx);
10485 mutex_lock(&ctx->uring_lock);
10486 if (ret)
10487 return ret;
10488
10489 tctx = current->io_uring;
10490 for (i = 0; i < nr_args; i++) {
10491 int start, end;
10492
10493 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10494 ret = -EFAULT;
10495 break;
10496 }
10497
6fb53cf8
DY
10498 if (reg.resv) {
10499 ret = -EINVAL;
10500 break;
10501 }
10502
e7a6c00d
JA
10503 if (reg.offset == -1U) {
10504 start = 0;
10505 end = IO_RINGFD_REG_MAX;
10506 } else {
10507 if (reg.offset >= IO_RINGFD_REG_MAX) {
10508 ret = -EINVAL;
10509 break;
10510 }
10511 start = reg.offset;
10512 end = start + 1;
10513 }
10514
10515 ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
10516 if (ret < 0)
10517 break;
10518
10519 reg.offset = ret;
10520 if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
10521 fput(tctx->registered_rings[reg.offset]);
10522 tctx->registered_rings[reg.offset] = NULL;
10523 ret = -EFAULT;
10524 break;
10525 }
10526 }
10527
10528 return i ? i : ret;
10529}
10530
10531static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
10532 unsigned nr_args)
10533{
10534 struct io_uring_rsrc_update __user *arg = __arg;
10535 struct io_uring_task *tctx = current->io_uring;
10536 struct io_uring_rsrc_update reg;
10537 int ret = 0, i;
10538
10539 if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10540 return -EINVAL;
10541 if (!tctx)
10542 return 0;
10543
10544 for (i = 0; i < nr_args; i++) {
10545 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10546 ret = -EFAULT;
10547 break;
10548 }
6fb53cf8 10549 if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) {
e7a6c00d
JA
10550 ret = -EINVAL;
10551 break;
10552 }
10553
10554 reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
10555 if (tctx->registered_rings[reg.offset]) {
10556 fput(tctx->registered_rings[reg.offset]);
10557 tctx->registered_rings[reg.offset] = NULL;
10558 }
10559 }
10560
10561 return i ? i : ret;
10562}
10563
6c5c240e
RP
10564static void *io_uring_validate_mmap_request(struct file *file,
10565 loff_t pgoff, size_t sz)
2b188cc1 10566{
2b188cc1 10567 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 10568 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
10569 struct page *page;
10570 void *ptr;
10571
10572 switch (offset) {
10573 case IORING_OFF_SQ_RING:
75b28aff
HV
10574 case IORING_OFF_CQ_RING:
10575 ptr = ctx->rings;
2b188cc1
JA
10576 break;
10577 case IORING_OFF_SQES:
10578 ptr = ctx->sq_sqes;
10579 break;
2b188cc1 10580 default:
6c5c240e 10581 return ERR_PTR(-EINVAL);
2b188cc1
JA
10582 }
10583
10584 page = virt_to_head_page(ptr);
a50b854e 10585 if (sz > page_size(page))
6c5c240e
RP
10586 return ERR_PTR(-EINVAL);
10587
10588 return ptr;
10589}
10590
10591#ifdef CONFIG_MMU
10592
c072481d 10593static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6c5c240e
RP
10594{
10595 size_t sz = vma->vm_end - vma->vm_start;
10596 unsigned long pfn;
10597 void *ptr;
10598
10599 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
10600 if (IS_ERR(ptr))
10601 return PTR_ERR(ptr);
2b188cc1
JA
10602
10603 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
10604 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
10605}
10606
6c5c240e
RP
10607#else /* !CONFIG_MMU */
10608
10609static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10610{
10611 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
10612}
10613
10614static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
10615{
10616 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
10617}
10618
10619static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
10620 unsigned long addr, unsigned long len,
10621 unsigned long pgoff, unsigned long flags)
10622{
10623 void *ptr;
10624
10625 ptr = io_uring_validate_mmap_request(file, pgoff, len);
10626 if (IS_ERR(ptr))
10627 return PTR_ERR(ptr);
10628
10629 return (unsigned long) ptr;
10630}
10631
10632#endif /* !CONFIG_MMU */
10633
d9d05217 10634static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
10635{
10636 DEFINE_WAIT(wait);
10637
10638 do {
10639 if (!io_sqring_full(ctx))
10640 break;
90554200
JA
10641 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
10642
10643 if (!io_sqring_full(ctx))
10644 break;
90554200
JA
10645 schedule();
10646 } while (!signal_pending(current));
10647
10648 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 10649 return 0;
90554200
JA
10650}
10651
f81440d3
PB
10652static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
10653{
10654 if (flags & IORING_ENTER_EXT_ARG) {
10655 struct io_uring_getevents_arg arg;
10656
10657 if (argsz != sizeof(arg))
10658 return -EINVAL;
10659 if (copy_from_user(&arg, argp, sizeof(arg)))
10660 return -EFAULT;
10661 }
10662 return 0;
10663}
10664
c73ebb68
HX
10665static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
10666 struct __kernel_timespec __user **ts,
10667 const sigset_t __user **sig)
10668{
10669 struct io_uring_getevents_arg arg;
10670
10671 /*
10672 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
10673 * is just a pointer to the sigset_t.
10674 */
10675 if (!(flags & IORING_ENTER_EXT_ARG)) {
10676 *sig = (const sigset_t __user *) argp;
10677 *ts = NULL;
10678 return 0;
10679 }
10680
10681 /*
10682 * EXT_ARG is set - ensure we agree on the size of it and copy in our
10683 * timespec and sigset_t pointers if good.
10684 */
10685 if (*argsz != sizeof(arg))
10686 return -EINVAL;
10687 if (copy_from_user(&arg, argp, sizeof(arg)))
10688 return -EFAULT;
d2347b96
DY
10689 if (arg.pad)
10690 return -EINVAL;
c73ebb68
HX
10691 *sig = u64_to_user_ptr(arg.sigmask);
10692 *argsz = arg.sigmask_sz;
10693 *ts = u64_to_user_ptr(arg.ts);
10694 return 0;
10695}
10696
2b188cc1 10697SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
10698 u32, min_complete, u32, flags, const void __user *, argp,
10699 size_t, argsz)
2b188cc1
JA
10700{
10701 struct io_ring_ctx *ctx;
2b188cc1
JA
10702 int submitted = 0;
10703 struct fd f;
33f993da 10704 long ret;
2b188cc1 10705
4c6e277c 10706 io_run_task_work();
b41e9852 10707
33f993da 10708 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
e7a6c00d
JA
10709 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
10710 IORING_ENTER_REGISTERED_RING)))
2b188cc1
JA
10711 return -EINVAL;
10712
e7a6c00d
JA
10713 /*
10714 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
10715 * need only dereference our task private array to find it.
10716 */
10717 if (flags & IORING_ENTER_REGISTERED_RING) {
10718 struct io_uring_task *tctx = current->io_uring;
10719
10720 if (!tctx || fd >= IO_RINGFD_REG_MAX)
10721 return -EINVAL;
10722 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
10723 f.file = tctx->registered_rings[fd];
10724 if (unlikely(!f.file))
10725 return -EBADF;
10726 } else {
10727 f = fdget(fd);
10728 if (unlikely(!f.file))
10729 return -EBADF;
10730 }
2b188cc1
JA
10731
10732 ret = -EOPNOTSUPP;
33f993da 10733 if (unlikely(f.file->f_op != &io_uring_fops))
2b188cc1
JA
10734 goto out_fput;
10735
10736 ret = -ENXIO;
10737 ctx = f.file->private_data;
33f993da 10738 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
2b188cc1
JA
10739 goto out_fput;
10740
7e84e1c7 10741 ret = -EBADFD;
33f993da 10742 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7e84e1c7
SG
10743 goto out;
10744
6c271ce2
JA
10745 /*
10746 * For SQ polling, the thread will do all submissions and completions.
10747 * Just return the requested submit count, and wake the thread if
10748 * we were asked to.
10749 */
b2a9eada 10750 ret = 0;
6c271ce2 10751 if (ctx->flags & IORING_SETUP_SQPOLL) {
90f67366 10752 io_cqring_overflow_flush(ctx);
89448c47 10753
21f96522
JA
10754 if (unlikely(ctx->sq_data->thread == NULL)) {
10755 ret = -EOWNERDEAD;
04147488 10756 goto out;
21f96522 10757 }
6c271ce2 10758 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 10759 wake_up(&ctx->sq_data->wait);
d9d05217
PB
10760 if (flags & IORING_ENTER_SQ_WAIT) {
10761 ret = io_sqpoll_wait_sq(ctx);
10762 if (ret)
10763 goto out;
10764 }
6c271ce2 10765 submitted = to_submit;
b2a9eada 10766 } else if (to_submit) {
eef51daa 10767 ret = io_uring_add_tctx_node(ctx);
0f212204
JA
10768 if (unlikely(ret))
10769 goto out;
d487b43c 10770
2b188cc1 10771 mutex_lock(&ctx->uring_lock);
0f212204 10772 submitted = io_submit_sqes(ctx, to_submit);
d487b43c
PB
10773 if (submitted != to_submit) {
10774 mutex_unlock(&ctx->uring_lock);
7c504e65 10775 goto out;
d487b43c
PB
10776 }
10777 if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
10778 goto iopoll_locked;
10779 mutex_unlock(&ctx->uring_lock);
2b188cc1
JA
10780 }
10781 if (flags & IORING_ENTER_GETEVENTS) {
773697b6 10782 if (ctx->syscall_iopoll) {
d487b43c
PB
10783 /*
10784 * We disallow the app entering submit/complete with
10785 * polling, but we still need to lock the ring to
10786 * prevent racing with polled issue that got punted to
10787 * a workqueue.
10788 */
10789 mutex_lock(&ctx->uring_lock);
10790iopoll_locked:
f81440d3 10791 ret = io_validate_ext_arg(flags, argp, argsz);
d487b43c
PB
10792 if (likely(!ret)) {
10793 min_complete = min(min_complete, ctx->cq_entries);
10794 ret = io_iopoll_check(ctx, min_complete);
10795 }
10796 mutex_unlock(&ctx->uring_lock);
def596e9 10797 } else {
f81440d3
PB
10798 const sigset_t __user *sig;
10799 struct __kernel_timespec __user *ts;
10800
10801 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10802 if (unlikely(ret))
10803 goto out;
d487b43c 10804 min_complete = min(min_complete, ctx->cq_entries);
c73ebb68 10805 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 10806 }
2b188cc1
JA
10807 }
10808
7c504e65 10809out:
6805b32e 10810 percpu_ref_put(&ctx->refs);
2b188cc1 10811out_fput:
e7a6c00d
JA
10812 if (!(flags & IORING_ENTER_REGISTERED_RING))
10813 fdput(f);
2b188cc1
JA
10814 return submitted ? submitted : ret;
10815}
10816
bebdb65e 10817#ifdef CONFIG_PROC_FS
c072481d 10818static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
61cf9370 10819 const struct cred *cred)
87ce955b 10820{
87ce955b
JA
10821 struct user_namespace *uns = seq_user_ns(m);
10822 struct group_info *gi;
10823 kernel_cap_t cap;
10824 unsigned __capi;
10825 int g;
10826
10827 seq_printf(m, "%5d\n", id);
10828 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10829 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10830 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10831 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10832 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10833 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10834 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10835 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10836 seq_puts(m, "\n\tGroups:\t");
10837 gi = cred->group_info;
10838 for (g = 0; g < gi->ngroups; g++) {
10839 seq_put_decimal_ull(m, g ? " " : "",
10840 from_kgid_munged(uns, gi->gid[g]));
10841 }
10842 seq_puts(m, "\n\tCapEff:\t");
10843 cap = cred->cap_effective;
10844 CAP_FOR_EACH_U32(__capi)
10845 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10846 seq_putc(m, '\n');
10847 return 0;
10848}
10849
c072481d
PB
10850static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10851 struct seq_file *m)
87ce955b 10852{
dbbe9c64 10853 struct io_sq_data *sq = NULL;
83f84356
HX
10854 struct io_overflow_cqe *ocqe;
10855 struct io_rings *r = ctx->rings;
10856 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
83f84356
HX
10857 unsigned int sq_head = READ_ONCE(r->sq.head);
10858 unsigned int sq_tail = READ_ONCE(r->sq.tail);
10859 unsigned int cq_head = READ_ONCE(r->cq.head);
10860 unsigned int cq_tail = READ_ONCE(r->cq.tail);
f75d1183 10861 unsigned int sq_entries, cq_entries;
fad8e0de 10862 bool has_lock;
83f84356
HX
10863 unsigned int i;
10864
10865 /*
10866 * we may get imprecise sqe and cqe info if uring is actively running
10867 * since we get cached_sq_head and cached_cq_tail without uring_lock
10868 * and sq_tail and cq_head are changed by userspace. But it's ok since
10869 * we usually use these info when it is stuck.
10870 */
c0235652 10871 seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
f75d1183
JA
10872 seq_printf(m, "SqHead:\t%u\n", sq_head);
10873 seq_printf(m, "SqTail:\t%u\n", sq_tail);
10874 seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10875 seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10876 seq_printf(m, "CqHead:\t%u\n", cq_head);
10877 seq_printf(m, "CqTail:\t%u\n", cq_tail);
10878 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10879 seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10880 sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10881 for (i = 0; i < sq_entries; i++) {
10882 unsigned int entry = i + sq_head;
10883 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
a1957780 10884 struct io_uring_sqe *sqe;
f75d1183
JA
10885
10886 if (sq_idx > sq_mask)
10887 continue;
10888 sqe = &ctx->sq_sqes[sq_idx];
10889 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10890 sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10891 sqe->user_data);
83f84356 10892 }
f75d1183
JA
10893 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10894 cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10895 for (i = 0; i < cq_entries; i++) {
10896 unsigned int entry = i + cq_head;
10897 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
83f84356
HX
10898
10899 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
f75d1183
JA
10900 entry & cq_mask, cqe->user_data, cqe->res,
10901 cqe->flags);
83f84356 10902 }
87ce955b 10903
fad8e0de
JA
10904 /*
10905 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10906 * since fdinfo case grabs it in the opposite direction of normal use
10907 * cases. If we fail to get the lock, we just don't iterate any
10908 * structures that could be going away outside the io_uring mutex.
10909 */
10910 has_lock = mutex_trylock(&ctx->uring_lock);
10911
5f3f26f9 10912 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 10913 sq = ctx->sq_data;
5f3f26f9
JA
10914 if (!sq->thread)
10915 sq = NULL;
10916 }
dbbe9c64
JQ
10917
10918 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10919 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 10920 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 10921 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7b29f92d 10922 struct file *f = io_file_from_index(ctx, i);
87ce955b 10923
87ce955b
JA
10924 if (f)
10925 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10926 else
10927 seq_printf(m, "%5u: <none>\n", i);
10928 }
10929 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 10930 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
41edf1a5 10931 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
4751f53d 10932 unsigned int len = buf->ubuf_end - buf->ubuf;
87ce955b 10933
4751f53d 10934 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
87ce955b 10935 }
61cf9370
MWO
10936 if (has_lock && !xa_empty(&ctx->personalities)) {
10937 unsigned long index;
10938 const struct cred *cred;
10939
87ce955b 10940 seq_printf(m, "Personalities:\n");
61cf9370
MWO
10941 xa_for_each(&ctx->personalities, index, cred)
10942 io_uring_show_cred(m, index, cred);
87ce955b 10943 }
83f84356
HX
10944 if (has_lock)
10945 mutex_unlock(&ctx->uring_lock);
10946
10947 seq_puts(m, "PollList:\n");
79ebeaee 10948 spin_lock(&ctx->completion_lock);
d7718a9d
JA
10949 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10950 struct hlist_head *list = &ctx->cancel_hash[i];
10951 struct io_kiocb *req;
10952
10953 hlist_for_each_entry(req, list, hash_node)
10954 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7f62d40d 10955 task_work_pending(req->task));
d7718a9d 10956 }
83f84356
HX
10957
10958 seq_puts(m, "CqOverflowList:\n");
10959 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10960 struct io_uring_cqe *cqe = &ocqe->cqe;
10961
10962 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
10963 cqe->user_data, cqe->res, cqe->flags);
10964
10965 }
10966
79ebeaee 10967 spin_unlock(&ctx->completion_lock);
87ce955b
JA
10968}
10969
c072481d 10970static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
87ce955b
JA
10971{
10972 struct io_ring_ctx *ctx = f->private_data;
10973
10974 if (percpu_ref_tryget(&ctx->refs)) {
10975 __io_uring_show_fdinfo(ctx, m);
10976 percpu_ref_put(&ctx->refs);
10977 }
10978}
bebdb65e 10979#endif
87ce955b 10980
2b188cc1
JA
10981static const struct file_operations io_uring_fops = {
10982 .release = io_uring_release,
10983 .mmap = io_uring_mmap,
6c5c240e
RP
10984#ifndef CONFIG_MMU
10985 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10986 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10987#endif
2b188cc1 10988 .poll = io_uring_poll,
bebdb65e 10989#ifdef CONFIG_PROC_FS
87ce955b 10990 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 10991#endif
2b188cc1
JA
10992};
10993
c072481d
PB
10994static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10995 struct io_uring_params *p)
2b188cc1 10996{
75b28aff
HV
10997 struct io_rings *rings;
10998 size_t size, sq_array_offset;
2b188cc1 10999
bd740481
JA
11000 /* make sure these are sane, as we already accounted them */
11001 ctx->sq_entries = p->sq_entries;
11002 ctx->cq_entries = p->cq_entries;
11003
75b28aff
HV
11004 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
11005 if (size == SIZE_MAX)
11006 return -EOVERFLOW;
11007
11008 rings = io_mem_alloc(size);
11009 if (!rings)
2b188cc1
JA
11010 return -ENOMEM;
11011
75b28aff
HV
11012 ctx->rings = rings;
11013 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
11014 rings->sq_ring_mask = p->sq_entries - 1;
11015 rings->cq_ring_mask = p->cq_entries - 1;
11016 rings->sq_ring_entries = p->sq_entries;
11017 rings->cq_ring_entries = p->cq_entries;
2b188cc1
JA
11018
11019 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
11020 if (size == SIZE_MAX) {
11021 io_mem_free(ctx->rings);
11022 ctx->rings = NULL;
2b188cc1 11023 return -EOVERFLOW;
eb065d30 11024 }
2b188cc1
JA
11025
11026 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
11027 if (!ctx->sq_sqes) {
11028 io_mem_free(ctx->rings);
11029 ctx->rings = NULL;
2b188cc1 11030 return -ENOMEM;
eb065d30 11031 }
2b188cc1 11032
2b188cc1
JA
11033 return 0;
11034}
11035
9faadcc8
PB
11036static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
11037{
11038 int ret, fd;
11039
11040 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
11041 if (fd < 0)
11042 return fd;
11043
eef51daa 11044 ret = io_uring_add_tctx_node(ctx);
9faadcc8
PB
11045 if (ret) {
11046 put_unused_fd(fd);
11047 return ret;
11048 }
11049 fd_install(fd, file);
11050 return fd;
11051}
11052
2b188cc1
JA
11053/*
11054 * Allocate an anonymous fd, this is what constitutes the application
11055 * visible backing of an io_uring instance. The application mmaps this
11056 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
11057 * we have to tie this fd to a socket for file garbage collection purposes.
11058 */
9faadcc8 11059static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
11060{
11061 struct file *file;
9faadcc8 11062#if defined(CONFIG_UNIX)
2b188cc1
JA
11063 int ret;
11064
2b188cc1
JA
11065 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
11066 &ctx->ring_sock);
11067 if (ret)
9faadcc8 11068 return ERR_PTR(ret);
2b188cc1
JA
11069#endif
11070
91a9ab7c
PM
11071 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
11072 O_RDWR | O_CLOEXEC, NULL);
2b188cc1 11073#if defined(CONFIG_UNIX)
9faadcc8
PB
11074 if (IS_ERR(file)) {
11075 sock_release(ctx->ring_sock);
11076 ctx->ring_sock = NULL;
11077 } else {
11078 ctx->ring_sock->file = file;
0f212204 11079 }
2b188cc1 11080#endif
9faadcc8 11081 return file;
2b188cc1
JA
11082}
11083
c072481d
PB
11084static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
11085 struct io_uring_params __user *params)
2b188cc1 11086{
2b188cc1 11087 struct io_ring_ctx *ctx;
9faadcc8 11088 struct file *file;
2b188cc1
JA
11089 int ret;
11090
8110c1a6 11091 if (!entries)
2b188cc1 11092 return -EINVAL;
8110c1a6
JA
11093 if (entries > IORING_MAX_ENTRIES) {
11094 if (!(p->flags & IORING_SETUP_CLAMP))
11095 return -EINVAL;
11096 entries = IORING_MAX_ENTRIES;
11097 }
2b188cc1
JA
11098
11099 /*
11100 * Use twice as many entries for the CQ ring. It's possible for the
11101 * application to drive a higher depth than the size of the SQ ring,
11102 * since the sqes are only used at submission time. This allows for
33a107f0
JA
11103 * some flexibility in overcommitting a bit. If the application has
11104 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
11105 * of CQ ring entries manually.
2b188cc1
JA
11106 */
11107 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
11108 if (p->flags & IORING_SETUP_CQSIZE) {
11109 /*
11110 * If IORING_SETUP_CQSIZE is set, we do the same roundup
11111 * to a power-of-two, if it isn't already. We do NOT impose
11112 * any cq vs sq ring sizing.
11113 */
eb2667b3 11114 if (!p->cq_entries)
33a107f0 11115 return -EINVAL;
8110c1a6
JA
11116 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
11117 if (!(p->flags & IORING_SETUP_CLAMP))
11118 return -EINVAL;
11119 p->cq_entries = IORING_MAX_CQ_ENTRIES;
11120 }
eb2667b3
JQ
11121 p->cq_entries = roundup_pow_of_two(p->cq_entries);
11122 if (p->cq_entries < p->sq_entries)
11123 return -EINVAL;
33a107f0
JA
11124 } else {
11125 p->cq_entries = 2 * p->sq_entries;
11126 }
2b188cc1 11127
2b188cc1 11128 ctx = io_ring_ctx_alloc(p);
62e398be 11129 if (!ctx)
2b188cc1 11130 return -ENOMEM;
773697b6
PB
11131
11132 /*
11133 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
11134 * space applications don't need to do io completion events
11135 * polling again, they can rely on io_sq_thread to do polling
11136 * work, which can reduce cpu usage and uring_lock contention.
11137 */
11138 if (ctx->flags & IORING_SETUP_IOPOLL &&
11139 !(ctx->flags & IORING_SETUP_SQPOLL))
11140 ctx->syscall_iopoll = 1;
11141
2b188cc1 11142 ctx->compat = in_compat_syscall();
62e398be
JA
11143 if (!capable(CAP_IPC_LOCK))
11144 ctx->user = get_uid(current_user());
2aede0e4
JA
11145
11146 /*
11147 * This is just grabbed for accounting purposes. When a process exits,
11148 * the mm is exited and dropped before the files, hence we need to hang
11149 * on to this mm purely for the purposes of being able to unaccount
11150 * memory (locked/pinned vm). It's not used for anything else.
11151 */
6b7898eb 11152 mmgrab(current->mm);
2aede0e4 11153 ctx->mm_account = current->mm;
6b7898eb 11154
2b188cc1
JA
11155 ret = io_allocate_scq_urings(ctx, p);
11156 if (ret)
11157 goto err;
11158
7e84e1c7 11159 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
11160 if (ret)
11161 goto err;
eae071c9 11162 /* always set a rsrc node */
47b228ce
PB
11163 ret = io_rsrc_node_switch_start(ctx);
11164 if (ret)
11165 goto err;
eae071c9 11166 io_rsrc_node_switch(ctx, NULL);
2b188cc1 11167
2b188cc1 11168 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
11169 p->sq_off.head = offsetof(struct io_rings, sq.head);
11170 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
11171 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
11172 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
11173 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
11174 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
11175 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
11176
11177 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
11178 p->cq_off.head = offsetof(struct io_rings, cq.head);
11179 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
11180 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
11181 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
11182 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
11183 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 11184 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 11185
7f13657d
XW
11186 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
11187 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 11188 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 11189 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9690557e 11190 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
c4212f3e
JA
11191 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
11192 IORING_FEAT_LINKED_FILE;
7f13657d
XW
11193
11194 if (copy_to_user(params, p, sizeof(*p))) {
11195 ret = -EFAULT;
11196 goto err;
11197 }
d1719f70 11198
9faadcc8
PB
11199 file = io_uring_get_file(ctx);
11200 if (IS_ERR(file)) {
11201 ret = PTR_ERR(file);
11202 goto err;
11203 }
11204
044c1ab3
JA
11205 /*
11206 * Install ring fd as the very last thing, so we don't risk someone
11207 * having closed it before we finish setup
11208 */
9faadcc8
PB
11209 ret = io_uring_install_fd(ctx, file);
11210 if (ret < 0) {
11211 /* fput will clean it up */
11212 fput(file);
11213 return ret;
11214 }
044c1ab3 11215
c826bd7a 11216 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
11217 return ret;
11218err:
11219 io_ring_ctx_wait_and_kill(ctx);
11220 return ret;
11221}
11222
11223/*
11224 * Sets up an aio uring context, and returns the fd. Applications asks for a
11225 * ring size, we return the actual sq/cq ring sizes (among other things) in the
11226 * params structure passed in.
11227 */
11228static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
11229{
11230 struct io_uring_params p;
2b188cc1
JA
11231 int i;
11232
11233 if (copy_from_user(&p, params, sizeof(p)))
11234 return -EFAULT;
11235 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
11236 if (p.resv[i])
11237 return -EINVAL;
11238 }
11239
6c271ce2 11240 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 11241 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7 11242 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
bcbb7bf6 11243 IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
2b188cc1
JA
11244 return -EINVAL;
11245
7f13657d 11246 return io_uring_create(entries, &p, params);
2b188cc1
JA
11247}
11248
11249SYSCALL_DEFINE2(io_uring_setup, u32, entries,
11250 struct io_uring_params __user *, params)
11251{
11252 return io_uring_setup(entries, params);
11253}
11254
c072481d
PB
11255static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
11256 unsigned nr_args)
66f4af93
JA
11257{
11258 struct io_uring_probe *p;
11259 size_t size;
11260 int i, ret;
11261
11262 size = struct_size(p, ops, nr_args);
11263 if (size == SIZE_MAX)
11264 return -EOVERFLOW;
11265 p = kzalloc(size, GFP_KERNEL);
11266 if (!p)
11267 return -ENOMEM;
11268
11269 ret = -EFAULT;
11270 if (copy_from_user(p, arg, size))
11271 goto out;
11272 ret = -EINVAL;
11273 if (memchr_inv(p, 0, size))
11274 goto out;
11275
11276 p->last_op = IORING_OP_LAST - 1;
11277 if (nr_args > IORING_OP_LAST)
11278 nr_args = IORING_OP_LAST;
11279
11280 for (i = 0; i < nr_args; i++) {
11281 p->ops[i].op = i;
11282 if (!io_op_defs[i].not_supported)
11283 p->ops[i].flags = IO_URING_OP_SUPPORTED;
11284 }
11285 p->ops_len = i;
11286
11287 ret = 0;
11288 if (copy_to_user(arg, p, size))
11289 ret = -EFAULT;
11290out:
11291 kfree(p);
11292 return ret;
11293}
11294
071698e1
JA
11295static int io_register_personality(struct io_ring_ctx *ctx)
11296{
4379bf8b 11297 const struct cred *creds;
61cf9370 11298 u32 id;
1e6fa521 11299 int ret;
071698e1 11300
4379bf8b 11301 creds = get_current_cred();
1e6fa521 11302
61cf9370
MWO
11303 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
11304 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
a30f895a
JA
11305 if (ret < 0) {
11306 put_cred(creds);
11307 return ret;
11308 }
11309 return id;
071698e1
JA
11310}
11311
c072481d
PB
11312static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
11313 void __user *arg, unsigned int nr_args)
21b55dbc
SG
11314{
11315 struct io_uring_restriction *res;
11316 size_t size;
11317 int i, ret;
11318
7e84e1c7
SG
11319 /* Restrictions allowed only if rings started disabled */
11320 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11321 return -EBADFD;
11322
21b55dbc 11323 /* We allow only a single restrictions registration */
7e84e1c7 11324 if (ctx->restrictions.registered)
21b55dbc
SG
11325 return -EBUSY;
11326
11327 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
11328 return -EINVAL;
11329
11330 size = array_size(nr_args, sizeof(*res));
11331 if (size == SIZE_MAX)
11332 return -EOVERFLOW;
11333
11334 res = memdup_user(arg, size);
11335 if (IS_ERR(res))
11336 return PTR_ERR(res);
11337
11338 ret = 0;
11339
11340 for (i = 0; i < nr_args; i++) {
11341 switch (res[i].opcode) {
11342 case IORING_RESTRICTION_REGISTER_OP:
11343 if (res[i].register_op >= IORING_REGISTER_LAST) {
11344 ret = -EINVAL;
11345 goto out;
11346 }
11347
11348 __set_bit(res[i].register_op,
11349 ctx->restrictions.register_op);
11350 break;
11351 case IORING_RESTRICTION_SQE_OP:
11352 if (res[i].sqe_op >= IORING_OP_LAST) {
11353 ret = -EINVAL;
11354 goto out;
11355 }
11356
11357 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
11358 break;
11359 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
11360 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
11361 break;
11362 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
11363 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
11364 break;
11365 default:
11366 ret = -EINVAL;
11367 goto out;
11368 }
11369 }
11370
11371out:
11372 /* Reset all restrictions if an error happened */
11373 if (ret != 0)
11374 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
11375 else
7e84e1c7 11376 ctx->restrictions.registered = true;
21b55dbc
SG
11377
11378 kfree(res);
11379 return ret;
11380}
11381
7e84e1c7
SG
11382static int io_register_enable_rings(struct io_ring_ctx *ctx)
11383{
11384 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11385 return -EBADFD;
11386
11387 if (ctx->restrictions.registered)
11388 ctx->restricted = 1;
11389
0298ef96
PB
11390 ctx->flags &= ~IORING_SETUP_R_DISABLED;
11391 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
11392 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
11393 return 0;
11394}
11395
fdecb662 11396static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 11397 struct io_uring_rsrc_update2 *up,
98f0b3b4
PB
11398 unsigned nr_args)
11399{
11400 __u32 tmp;
11401 int err;
11402
11403 if (check_add_overflow(up->offset, nr_args, &tmp))
11404 return -EOVERFLOW;
11405 err = io_rsrc_node_switch_start(ctx);
11406 if (err)
11407 return err;
11408
fdecb662
PB
11409 switch (type) {
11410 case IORING_RSRC_FILE:
98f0b3b4 11411 return __io_sqe_files_update(ctx, up, nr_args);
634d00df
PB
11412 case IORING_RSRC_BUFFER:
11413 return __io_sqe_buffers_update(ctx, up, nr_args);
98f0b3b4
PB
11414 }
11415 return -EINVAL;
11416}
11417
c3bdad02
PB
11418static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
11419 unsigned nr_args)
98f0b3b4 11420{
c3bdad02 11421 struct io_uring_rsrc_update2 up;
98f0b3b4
PB
11422
11423 if (!nr_args)
11424 return -EINVAL;
c3bdad02
PB
11425 memset(&up, 0, sizeof(up));
11426 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
11427 return -EFAULT;
d8a3ba9c 11428 if (up.resv || up.resv2)
565c5e61 11429 return -EINVAL;
c3bdad02
PB
11430 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
11431}
11432
11433static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
992da01a 11434 unsigned size, unsigned type)
c3bdad02
PB
11435{
11436 struct io_uring_rsrc_update2 up;
11437
11438 if (size != sizeof(up))
11439 return -EINVAL;
98f0b3b4
PB
11440 if (copy_from_user(&up, arg, sizeof(up)))
11441 return -EFAULT;
d8a3ba9c 11442 if (!up.nr || up.resv || up.resv2)
98f0b3b4 11443 return -EINVAL;
992da01a 11444 return __io_register_rsrc_update(ctx, type, &up, up.nr);
98f0b3b4
PB
11445}
11446
c072481d 11447static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
992da01a 11448 unsigned int size, unsigned int type)
792e3582
PB
11449{
11450 struct io_uring_rsrc_register rr;
11451
11452 /* keep it extendible */
11453 if (size != sizeof(rr))
11454 return -EINVAL;
11455
11456 memset(&rr, 0, sizeof(rr));
11457 if (copy_from_user(&rr, arg, size))
11458 return -EFAULT;
992da01a 11459 if (!rr.nr || rr.resv || rr.resv2)
792e3582
PB
11460 return -EINVAL;
11461
992da01a 11462 switch (type) {
792e3582
PB
11463 case IORING_RSRC_FILE:
11464 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
11465 rr.nr, u64_to_user_ptr(rr.tags));
634d00df
PB
11466 case IORING_RSRC_BUFFER:
11467 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
11468 rr.nr, u64_to_user_ptr(rr.tags));
792e3582
PB
11469 }
11470 return -EINVAL;
11471}
11472
c072481d
PB
11473static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
11474 void __user *arg, unsigned len)
fe76421d
JA
11475{
11476 struct io_uring_task *tctx = current->io_uring;
11477 cpumask_var_t new_mask;
11478 int ret;
11479
11480 if (!tctx || !tctx->io_wq)
11481 return -EINVAL;
11482
11483 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
11484 return -ENOMEM;
11485
11486 cpumask_clear(new_mask);
11487 if (len > cpumask_size())
11488 len = cpumask_size();
11489
0f5e4b83
ES
11490 if (in_compat_syscall()) {
11491 ret = compat_get_bitmap(cpumask_bits(new_mask),
11492 (const compat_ulong_t __user *)arg,
11493 len * 8 /* CHAR_BIT */);
11494 } else {
11495 ret = copy_from_user(new_mask, arg, len);
11496 }
11497
11498 if (ret) {
fe76421d
JA
11499 free_cpumask_var(new_mask);
11500 return -EFAULT;
11501 }
11502
11503 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
11504 free_cpumask_var(new_mask);
11505 return ret;
11506}
11507
c072481d 11508static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
fe76421d
JA
11509{
11510 struct io_uring_task *tctx = current->io_uring;
11511
11512 if (!tctx || !tctx->io_wq)
11513 return -EINVAL;
11514
11515 return io_wq_cpu_affinity(tctx->io_wq, NULL);
11516}
11517
c072481d
PB
11518static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
11519 void __user *arg)
b22fa62a 11520 __must_hold(&ctx->uring_lock)
2e480058 11521{
b22fa62a 11522 struct io_tctx_node *node;
fa84693b
JA
11523 struct io_uring_task *tctx = NULL;
11524 struct io_sq_data *sqd = NULL;
2e480058
JA
11525 __u32 new_count[2];
11526 int i, ret;
11527
2e480058
JA
11528 if (copy_from_user(new_count, arg, sizeof(new_count)))
11529 return -EFAULT;
11530 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11531 if (new_count[i] > INT_MAX)
11532 return -EINVAL;
11533
fa84693b
JA
11534 if (ctx->flags & IORING_SETUP_SQPOLL) {
11535 sqd = ctx->sq_data;
11536 if (sqd) {
009ad9f0
JA
11537 /*
11538 * Observe the correct sqd->lock -> ctx->uring_lock
11539 * ordering. Fine to drop uring_lock here, we hold
11540 * a ref to the ctx.
11541 */
41d3a6bd 11542 refcount_inc(&sqd->refs);
009ad9f0 11543 mutex_unlock(&ctx->uring_lock);
fa84693b 11544 mutex_lock(&sqd->lock);
009ad9f0 11545 mutex_lock(&ctx->uring_lock);
41d3a6bd
JA
11546 if (sqd->thread)
11547 tctx = sqd->thread->io_uring;
fa84693b
JA
11548 }
11549 } else {
11550 tctx = current->io_uring;
11551 }
11552
e139a1ec 11553 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
fa84693b 11554
bad119b9
PB
11555 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11556 if (new_count[i])
11557 ctx->iowq_limits[i] = new_count[i];
e139a1ec
PB
11558 ctx->iowq_limits_set = true;
11559
e139a1ec
PB
11560 if (tctx && tctx->io_wq) {
11561 ret = io_wq_max_workers(tctx->io_wq, new_count);
11562 if (ret)
11563 goto err;
11564 } else {
11565 memset(new_count, 0, sizeof(new_count));
11566 }
fa84693b 11567
41d3a6bd 11568 if (sqd) {
fa84693b 11569 mutex_unlock(&sqd->lock);
41d3a6bd
JA
11570 io_put_sq_data(sqd);
11571 }
2e480058
JA
11572
11573 if (copy_to_user(arg, new_count, sizeof(new_count)))
11574 return -EFAULT;
11575
b22fa62a
PB
11576 /* that's it for SQPOLL, only the SQPOLL task creates requests */
11577 if (sqd)
11578 return 0;
11579
11580 /* now propagate the restriction to all registered users */
11581 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
11582 struct io_uring_task *tctx = node->task->io_uring;
11583
11584 if (WARN_ON_ONCE(!tctx->io_wq))
11585 continue;
11586
11587 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11588 new_count[i] = ctx->iowq_limits[i];
11589 /* ignore errors, it always returns zero anyway */
11590 (void)io_wq_max_workers(tctx->io_wq, new_count);
11591 }
2e480058 11592 return 0;
fa84693b 11593err:
41d3a6bd 11594 if (sqd) {
fa84693b 11595 mutex_unlock(&sqd->lock);
41d3a6bd
JA
11596 io_put_sq_data(sqd);
11597 }
fa84693b 11598 return ret;
2e480058
JA
11599}
11600
edafccee
JA
11601static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
11602 void __user *arg, unsigned nr_args)
b19062a5
JA
11603 __releases(ctx->uring_lock)
11604 __acquires(ctx->uring_lock)
edafccee
JA
11605{
11606 int ret;
11607
35fa71a0
JA
11608 /*
11609 * We're inside the ring mutex, if the ref is already dying, then
11610 * someone else killed the ctx or is already going through
11611 * io_uring_register().
11612 */
11613 if (percpu_ref_is_dying(&ctx->refs))
11614 return -ENXIO;
11615
75c4021a
PB
11616 if (ctx->restricted) {
11617 if (opcode >= IORING_REGISTER_LAST)
11618 return -EINVAL;
11619 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
11620 if (!test_bit(opcode, ctx->restrictions.register_op))
11621 return -EACCES;
11622 }
11623
edafccee
JA
11624 switch (opcode) {
11625 case IORING_REGISTER_BUFFERS:
634d00df 11626 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
edafccee
JA
11627 break;
11628 case IORING_UNREGISTER_BUFFERS:
11629 ret = -EINVAL;
11630 if (arg || nr_args)
11631 break;
0a96bbe4 11632 ret = io_sqe_buffers_unregister(ctx);
edafccee 11633 break;
6b06314c 11634 case IORING_REGISTER_FILES:
792e3582 11635 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
6b06314c
JA
11636 break;
11637 case IORING_UNREGISTER_FILES:
11638 ret = -EINVAL;
11639 if (arg || nr_args)
11640 break;
11641 ret = io_sqe_files_unregister(ctx);
11642 break;
c3a31e60 11643 case IORING_REGISTER_FILES_UPDATE:
c3bdad02 11644 ret = io_register_files_update(ctx, arg, nr_args);
c3a31e60 11645 break;
9b402849
JA
11646 case IORING_REGISTER_EVENTFD:
11647 ret = -EINVAL;
11648 if (nr_args != 1)
11649 break;
c75312dd
UA
11650 ret = io_eventfd_register(ctx, arg, 0);
11651 break;
11652 case IORING_REGISTER_EVENTFD_ASYNC:
11653 ret = -EINVAL;
11654 if (nr_args != 1)
f2842ab5 11655 break;
c75312dd 11656 ret = io_eventfd_register(ctx, arg, 1);
9b402849
JA
11657 break;
11658 case IORING_UNREGISTER_EVENTFD:
11659 ret = -EINVAL;
11660 if (arg || nr_args)
11661 break;
11662 ret = io_eventfd_unregister(ctx);
11663 break;
66f4af93
JA
11664 case IORING_REGISTER_PROBE:
11665 ret = -EINVAL;
11666 if (!arg || nr_args > 256)
11667 break;
11668 ret = io_probe(ctx, arg, nr_args);
11669 break;
071698e1
JA
11670 case IORING_REGISTER_PERSONALITY:
11671 ret = -EINVAL;
11672 if (arg || nr_args)
11673 break;
11674 ret = io_register_personality(ctx);
11675 break;
11676 case IORING_UNREGISTER_PERSONALITY:
11677 ret = -EINVAL;
11678 if (arg)
11679 break;
11680 ret = io_unregister_personality(ctx, nr_args);
11681 break;
7e84e1c7
SG
11682 case IORING_REGISTER_ENABLE_RINGS:
11683 ret = -EINVAL;
11684 if (arg || nr_args)
11685 break;
11686 ret = io_register_enable_rings(ctx);
11687 break;
21b55dbc
SG
11688 case IORING_REGISTER_RESTRICTIONS:
11689 ret = io_register_restrictions(ctx, arg, nr_args);
11690 break;
992da01a
PB
11691 case IORING_REGISTER_FILES2:
11692 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11693 break;
11694 case IORING_REGISTER_FILES_UPDATE2:
11695 ret = io_register_rsrc_update(ctx, arg, nr_args,
11696 IORING_RSRC_FILE);
11697 break;
11698 case IORING_REGISTER_BUFFERS2:
11699 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
792e3582 11700 break;
992da01a
PB
11701 case IORING_REGISTER_BUFFERS_UPDATE:
11702 ret = io_register_rsrc_update(ctx, arg, nr_args,
11703 IORING_RSRC_BUFFER);
c3bdad02 11704 break;
fe76421d
JA
11705 case IORING_REGISTER_IOWQ_AFF:
11706 ret = -EINVAL;
11707 if (!arg || !nr_args)
11708 break;
11709 ret = io_register_iowq_aff(ctx, arg, nr_args);
11710 break;
11711 case IORING_UNREGISTER_IOWQ_AFF:
11712 ret = -EINVAL;
11713 if (arg || nr_args)
11714 break;
11715 ret = io_unregister_iowq_aff(ctx);
11716 break;
2e480058
JA
11717 case IORING_REGISTER_IOWQ_MAX_WORKERS:
11718 ret = -EINVAL;
11719 if (!arg || nr_args != 2)
11720 break;
11721 ret = io_register_iowq_max_workers(ctx, arg);
11722 break;
e7a6c00d
JA
11723 case IORING_REGISTER_RING_FDS:
11724 ret = io_ringfd_register(ctx, arg, nr_args);
11725 break;
11726 case IORING_UNREGISTER_RING_FDS:
11727 ret = io_ringfd_unregister(ctx, arg, nr_args);
11728 break;
edafccee
JA
11729 default:
11730 ret = -EINVAL;
11731 break;
11732 }
11733
edafccee
JA
11734 return ret;
11735}
11736
11737SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11738 void __user *, arg, unsigned int, nr_args)
11739{
11740 struct io_ring_ctx *ctx;
11741 long ret = -EBADF;
11742 struct fd f;
11743
11744 f = fdget(fd);
11745 if (!f.file)
11746 return -EBADF;
11747
11748 ret = -EOPNOTSUPP;
11749 if (f.file->f_op != &io_uring_fops)
11750 goto out_fput;
11751
11752 ctx = f.file->private_data;
11753
b6c23dd5
PB
11754 io_run_task_work();
11755
edafccee
JA
11756 mutex_lock(&ctx->uring_lock);
11757 ret = __io_uring_register(ctx, opcode, arg, nr_args);
11758 mutex_unlock(&ctx->uring_lock);
2757be22 11759 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
edafccee
JA
11760out_fput:
11761 fdput(f);
11762 return ret;
11763}
11764
2b188cc1
JA
11765static int __init io_uring_init(void)
11766{
d7f62e82
SM
11767#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11768 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11769 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11770} while (0)
11771
11772#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11773 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11774 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11775 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
11776 BUILD_BUG_SQE_ELEM(1, __u8, flags);
11777 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
11778 BUILD_BUG_SQE_ELEM(4, __s32, fd);
11779 BUILD_BUG_SQE_ELEM(8, __u64, off);
11780 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
11781 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 11782 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
11783 BUILD_BUG_SQE_ELEM(24, __u32, len);
11784 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
11785 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
11786 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11787 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
11788 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
11789 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
11790 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
11791 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
11792 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
11793 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
11794 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
11795 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
11796 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
11797 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 11798 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
11799 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
11800 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
16340eab 11801 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
d7f62e82 11802 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 11803 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
b9445598 11804 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
d7f62e82 11805
b0d658ec
PB
11806 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11807 sizeof(struct io_uring_rsrc_update));
11808 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11809 sizeof(struct io_uring_rsrc_update2));
90499ad0
PB
11810
11811 /* ->buf_index is u16 */
11812 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11813
b0d658ec
PB
11814 /* should fit into one byte */
11815 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
68fe256a
PB
11816 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11817 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
b0d658ec 11818
d3656344 11819 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
32c2d33e 11820 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
16340eab 11821
91f245d5
JA
11822 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11823 SLAB_ACCOUNT);
2b188cc1
JA
11824 return 0;
11825};
11826__initcall(io_uring_init);