io_uring: use right helpers for file assign locking
[linux-2.6-block.git] / fs / io_uring.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
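
/*
 * Editorial illustration (not part of the kernel sources): the CQ reaping
 * protocol described above, roughly as an application would implement it
 * with liburing-style ring pointers. smp_load_acquire()/smp_store_release()
 * stand in for the barriers discussed; the names below are placeholders.
 *
 *	unsigned head = *cq_head;
 *	unsigned tail = smp_load_acquire(cq_tail);   (pairs with the kernel's tail store)
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_ring_mask];
 *		handle_cqe(cqe);
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);   (orders the entry loads before the head store)
 */
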
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 15)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
			    REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring	sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32	sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32	sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32	sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32	sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32	cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32	cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};

enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,
};

struct io_mapped_ubuf {
	u64		ubuf;
	u64		ubuf_end;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
	struct bio_vec	bvec[];
};

struct io_ring_ctx;

struct io_overflow_cqe {
	struct io_uring_cqe cqe;
	struct list_head list;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_rsrc_put {
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};

struct io_file_table {
	struct io_fixed_file *files;
};

struct io_rsrc_node {
	struct percpu_ref	refs;
	struct list_head	node;
	struct list_head	rsrc_list;
	struct io_rsrc_data	*rsrc_data;
	struct llist_node	llist;
	bool			done;
};

typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

struct io_rsrc_data {
	struct io_ring_ctx	*ctx;

	u64			**tags;
	unsigned int		nr;
	rsrc_put_fn		*do_put;
	atomic_t		refs;
	struct completion	done;
	bool			quiesce;
};

struct io_buffer_list {
	struct list_head list;
	struct list_head buf_list;
	__u16 bgid;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
	__u16 bgid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};

struct io_sq_data {
	refcount_t		refs;
	atomic_t		park_pending;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;

	struct task_struct	*thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
	int			sq_cpu;
	pid_t			task_pid;
	pid_t			task_tgid;

	unsigned long		state;
	struct completion	exited;
};

#define IO_COMPL_BATCH		32
#define IO_REQ_CACHE_SIZE	32
#define IO_REQ_ALLOC_BATCH	8

struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node	free_list;
	/* batch completion logic */
	struct io_wq_work_list	compl_reqs;
	struct io_submit_link	link;

	bool			plug_started;
	bool			need_plug;
	bool			flush_cqes;
	unsigned short		submit_nr;
	struct blk_plug		plug;
};

struct io_ev_fd {
	struct eventfd_ctx	*cq_ev_fd;
	unsigned int		eventfd_async: 1;
	struct rcu_head		rcu;
};

#define IO_BUFFERS_HASH_BITS	5

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		struct percpu_ref	refs;

		struct io_rings		*rings;
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		drain_next: 1;
		unsigned int		restricted: 1;
		unsigned int		off_timeout_used: 1;
		unsigned int		drain_active: 1;
		unsigned int		drain_disabled: 1;
		unsigned int		has_evfd: 1;
		unsigned int		syscall_iopoll: 1;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex		uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		struct io_uring_sqe	*sq_sqes;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		struct list_head	defer_list;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node	*rsrc_node;
		int			rsrc_cached_refs;
		struct io_file_table	file_table;
		unsigned		nr_user_files;
		unsigned		nr_user_bufs;
		struct io_mapped_ubuf	**user_bufs;

		struct io_submit_state	submit_state;
		struct list_head	timeout_list;
		struct list_head	ltimeout_list;
		struct list_head	cq_overflow_list;
		struct list_head	*io_buffers;
		struct list_head	io_buffers_cache;
		struct list_head	apoll_cache;
		struct xarray		personalities;
		u32			pers_next;
		unsigned		sq_thread_idle;
	} ____cacheline_aligned_in_smp;

	/* IRQ completion list, under ->completion_lock */
	struct io_wq_work_list	locked_free_list;
	unsigned int		locked_free_nr;

	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	unsigned long		check_cq_overflow;

	struct {
		/*
		 * We cache a range of free CQEs we can use, once exhausted it
		 * should go through a slower range setup, see __io_get_cqe()
		 */
		struct io_uring_cqe	*cqe_cached;
		struct io_uring_cqe	*cqe_sentinel;

		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		struct io_ev_fd	__rcu	*io_ev_fd;
		struct wait_queue_head	cq_wait;
		unsigned		cq_extra;
		atomic_t		cq_timeouts;
		unsigned		cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		spinlock_t		timeout_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct io_wq_work_list	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_queue;

		struct list_head	io_buffers_comp;
	} ____cacheline_aligned_in_smp;

	struct io_restriction		restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct {
		struct io_rsrc_node	*rsrc_backup_node;
		struct io_mapped_ubuf	*dummy_ubuf;
		struct io_rsrc_data	*file_data;
		struct io_rsrc_data	*buf_data;

		struct delayed_work	rsrc_put_work;
		struct llist_head	rsrc_put_llist;
		struct list_head	rsrc_ref_list;
		spinlock_t		rsrc_ref_lock;

		struct list_head	io_buffers_pages;
	};

	/* Keep this last, we don't need it for the fast path */
	struct {
		#if defined(CONFIG_UNIX)
			struct socket	*ring_sock;
		#endif
		/* hashed buffered write serialization */
		struct io_wq_hash	*hash_map;

		/* Only used for accounting purposes */
		struct user_struct	*user;
		struct mm_struct	*mm_account;

		/* ctx exit and cancelation */
		struct llist_head	fallback_llist;
		struct delayed_work	fallback_work;
		struct work_struct	exit_work;
		struct list_head	tctx_list;
		struct completion	ref_comp;
		u32			iowq_limits[2];
		bool			iowq_limits_set;
	};
};

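/*
 * Editorial sketch of the sq_array indirection documented above (an
 * illustration, not code lifted from this file): the SQ ring holds indices
 * into sq_sqes, so picking up the next submission amounts to roughly
 *
 *	unsigned idx = ctx->cached_sq_head++ & (ctx->sq_entries - 1);
 *	sqe = &ctx->sq_sqes[READ_ONCE(ctx->sq_array[idx])];
 *
 * with the kernel never writing to either array.
 */
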
/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int			cached_refs;
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		in_idle;

	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	struct io_wq_work_list	prior_task_list;
	struct callback_head	task_work;
	struct file		**registered_rings;
	bool			task_running;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	struct wait_queue_entry		wait;
};

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_close {
	struct file			*file;
	int				fd;
	u32				file_slot;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
	u32				flags;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};

struct io_cancel {
	struct file			*file;
	u64				addr;
};

struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
	/* for linked completions */
	struct io_kiocb			*prev;
};

struct io_timeout_rem {
	struct file			*file;
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	u32				flags;
	bool				ltimeout;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	u32				flags;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	size_t				done_io;
};

struct io_open {
	struct file			*file;
	int				dfd;
	u32				file_slot;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};

struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};

struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};

struct io_splice {
	struct file			*file_out;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	int				splice_fd_in;
	unsigned int			flags;
};

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	struct filename			*filename;
	struct statx __user		*buffer;
};

struct io_shutdown {
	struct file			*file;
	int				how;
};

struct io_rename {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};

struct io_unlink {
	struct file			*file;
	int				dfd;
	int				flags;
	struct filename			*filename;
};

struct io_mkdir {
	struct file			*file;
	int				dfd;
	umode_t				mode;
	struct filename			*filename;
};

struct io_symlink {
	struct file			*file;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
};

struct io_hardlink {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};

struct io_msg {
	struct file			*file;
	u64 user_data;
	u32 len;
};

struct io_async_connect {
	struct sockaddr_storage		address;
};

struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	/* points to an allocated iov, if NULL we use fast_iov instead */
	struct iovec			*free_iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};

struct io_rw_state {
	struct iov_iter			iter;
	struct iov_iter_state		iter_state;
	struct iovec			fast_iov[UIO_FASTIOV];
};

struct io_async_rw {
	struct io_rw_state		s;
	const struct iovec		*free_iovec;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};

enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT		= 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_COMPLETE_INLINE_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_PARTIAL_IO_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* completion is deferred through io_comp_state */
	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
	/* double poll may be active */
	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
	/* request has already done partial IO */
	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
};

struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

struct io_task_work {
	union {
		struct io_wq_work_node	node;
		struct llist_node	fallback_node;
	};
	io_req_tw_func_t		func;
};

enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};

struct io_cqe {
	__u64	user_data;
	__s32	res;
	/* fd initially, then cflags for completion */
	union {
		__u32	flags;
		int	fd;
	};
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		struct io_mkdir		mkdir;
		struct io_symlink	symlink;
		struct io_hardlink	hardlink;
		struct io_msg		msg;
	};

	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;
	u16				buf_index;
	unsigned int			flags;

	struct io_cqe			cqe;

	struct io_ring_ctx		*ctx;
	struct task_struct		*task;

	struct percpu_ref		*fixed_rsrc_refs;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node	comp_list;
		/* cache ->apoll->events */
		int			apoll_events;
	};
	atomic_t			refs;
	atomic_t			poll_refs;
	struct io_task_work		io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
	struct io_buffer		*kbuf;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	struct io_kiocb			*link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred		*creds;
	struct io_wq_work		work;
};

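/*
 * Editorial note on the layout above: because a 'struct file *' (or a kiocb,
 * whose first member is the file) begins every union member, expressions such
 * as req->file, req->poll.file and req->rw.kiocb.ki_filp all refer to the
 * same storage, which is what the NOTE preceding the union relies on.
 */
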
struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	unsigned		poll_exclusive : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if it is going to be punted */
	unsigned		needs_async_setup : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* skip auditing */
	unsigned		audit_skip : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};

static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_POLL_REMOVE] = {
		.audit_skip		= 1,
	},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
		.audit_skip		= 1,
	},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.poll_exclusive		= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {
		.audit_skip		= 1,
	},
	[IORING_OP_LINK_TIMEOUT] = {
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {
		.audit_skip		= 1,
	},
	[IORING_OP_STATX] = {
		.audit_skip		= 1,
	},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.audit_skip		= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {
		.audit_skip		= 1,
	},
	[IORING_OP_REMOVE_BUFFERS] = {
		.audit_skip		= 1,
	},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.audit_skip		= 1,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file		= 1,
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
	[IORING_OP_MKDIRAT] = {},
	[IORING_OP_SYMLINKAT] = {},
	[IORING_OP_LINKAT] = {},
	[IORING_OP_MSG_RING] = {
		.needs_file		= 1,
	},
};

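/*
 * Editorial sketch (not lifted verbatim from this file): the table above is
 * consulted on the submission path; for instance, the "needs req->file
 * assigned" bit is what io_assign_file() acts on, roughly
 *
 *	if (io_op_defs[req->opcode].needs_file &&
 *	    !io_assign_file(req, issue_flags))
 *		return -EBADF;
 *
 * while the remaining bits steer io-wq hashing, plugging and audit skipping.
 */
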
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
					     unsigned issue_flags);
static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);
static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

#if defined(CONFIG_UNIX)
static inline bool io_file_need_scm(struct file *filp)
{
	return !!unix_get_socket(filp);
}
#else
static inline bool io_file_need_scm(struct file *filp)
{
	return 0;
}
#endif

static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
{
	lockdep_assert_held(&ctx->uring_lock);
	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_unlock(&ctx->uring_lock);
}

static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
{
	/*
	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
	 * The only exception is when we've detached the request and issue it
	 * from an async worker thread, grab the lock for that case.
	 */
	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_lock(&ctx->uring_lock);
	lockdep_assert_held(&ctx->uring_lock);
}

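/*
 * Editorial usage note: paths that may run either inline (uring_lock already
 * held) or from an unlocked async context bracket their critical sections
 * with the two helpers above, e.g.
 *
 *	io_ring_submit_lock(ctx, issue_flags);
 *	...touch ctx->io_buffers or the fixed file table...
 *	io_ring_submit_unlock(ctx, issue_flags);
 *
 * as io_kbuf_recycle() further down does; the lockdep assertion then holds in
 * both cases.
 */
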
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
	if (!*locked) {
		mutex_lock(&ctx->uring_lock);
		*locked = true;
	}
}

#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)

static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	return atomic_inc_not_zero(&req->refs);
}

static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_REFCOUNT)))
		return true;

	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	return atomic_dec_and_test(&req->refs);
}

static inline void req_ref_get(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	atomic_inc(&req->refs);
}

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);
}

static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
	if (!(req->flags & REQ_F_REFCOUNT)) {
		req->flags |= REQ_F_REFCOUNT;
		atomic_set(&req->refs, nr);
	}
}

static inline void io_req_set_refcount(struct io_kiocb *req)
{
	__io_req_set_refcount(req, 1);
}

#define IO_RSRC_REF_BATCH	100

static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
					  struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct percpu_ref *ref = req->fixed_rsrc_refs;

	if (ref) {
		if (ref == &ctx->rsrc_node->refs)
			ctx->rsrc_cached_refs++;
		else
			percpu_ref_put(ref);
	}
}

static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	if (req->fixed_rsrc_refs)
		percpu_ref_put(req->fixed_rsrc_refs);
}

static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}

static inline void io_req_set_rsrc_node(struct io_kiocb *req,
					struct io_ring_ctx *ctx,
					unsigned int issue_flags)
{
	if (!req->fixed_rsrc_refs) {
		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;

		if (!(issue_flags & IO_URING_F_UNLOCKED)) {
			lockdep_assert_held(&ctx->uring_lock);
			ctx->rsrc_cached_refs--;
			if (unlikely(ctx->rsrc_cached_refs < 0))
				io_rsrc_refs_refill(ctx);
		} else {
			percpu_ref_get(req->fixed_rsrc_refs);
		}
	}
}

static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
	struct io_buffer *kbuf = req->kbuf;
	unsigned int cflags;

	cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	list_add(&kbuf->list, list);
	req->kbuf = NULL;
	return cflags;
}

static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
	lockdep_assert_held(&req->ctx->completion_lock);

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return 0;
	return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}

static inline unsigned int io_put_kbuf(struct io_kiocb *req,
				       unsigned issue_flags)
{
	unsigned int cflags;

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return 0;

	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
	}

	return cflags;
}

static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
						 unsigned int bgid)
{
	struct list_head *hash_list;
	struct io_buffer_list *bl;

	hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
	list_for_each_entry(bl, hash_list, list)
		if (bl->bgid == bgid || bgid == -1U)
			return bl;

	return NULL;
}

static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return;
	/* don't recycle if we already did IO to this buffer */
	if (req->flags & REQ_F_PARTIAL_IO)
		return;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->kbuf = NULL;

	io_ring_submit_unlock(ctx, issue_flags);
}

static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
			  bool cancel_all)
	__must_hold(&req->ctx->timeout_lock)
{
	if (task && head->task != task)
		return false;
	return cancel_all;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			       bool cancel_all)
{
	if (task && head->task != task)
		return false;
	return cancel_all;
}

static inline bool req_has_async_data(struct io_kiocb *req)
{
	return req->flags & REQ_F_ASYNC_DATA;
}

static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
	if (req->flags & REQ_F_CQE_SKIP) {
		req->flags &= ~REQ_F_CQE_SKIP;
		req->flags |= REQ_F_SKIP_LINK_CQES;
	}
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	req->cqe.res = res;
}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
	return !req->timeout.off;
}

static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	bool locked = false;

	percpu_ref_get(&ctx->refs);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
		req->io_task_work.func(req, &locked);

	if (locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
	}
	percpu_ref_put(&ctx->refs);
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int i, hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
					GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

	ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
					sizeof(struct list_head), GFP_KERNEL);
	if (!ctx->io_buffers)
		goto err;
	for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
		INIT_LIST_HEAD(&ctx->io_buffers[i]);

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_LIST_HEAD(&ctx->apoll_cache);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->dummy_ubuf);
	kfree(ctx->cancel_hash);
	kfree(ctx->io_buffers);
	kfree(ctx);
	return NULL;
}

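/*
 * Editorial worked example for the cancel_hash sizing above: with
 * p->cq_entries == 4096, ilog2(4096) == 12, so hash_bits ends up as 7 and the
 * table gets 128 buckets, i.e. roughly 32 entries per hash list when the ring
 * is completely full, as the comment in io_ring_ctx_alloc() intends.
 */
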
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

#define FFS_NOWAIT		0x1UL
#define FFS_ISREG		0x2UL
#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return req->flags & REQ_F_FIXED_FILE;
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

static inline void io_req_add_compl_list(struct io_kiocb *req)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	if (!(req->flags & REQ_F_CQE_SKIP))
		state->flush_cqes = true;
	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}

static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
					req->opcode, req->flags, &req->work,
					io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static void io_kill_timeout(struct io_kiocb *req, int status)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_timeout_data *io = req->async_data;

	if (hrtimer_try_to_cancel(&io->timer) != -1) {
		if (status)
			req_set_fail(req);
		atomic_set(&req->ctx->cq_timeouts,
			atomic_read(&req->ctx->cq_timeouts) + 1);
		list_del_init(&req->timeout.list);
		io_req_tw_post_queue(req, status, 0);
	}
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
	__must_hold(&ctx->completion_lock)
{
	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	struct io_kiocb *req, *tmp;

	spin_lock_irq(&ctx->timeout_lock);
	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
		u32 events_needed, events_got;

		if (io_is_timeout_noseq(req))
			break;

		/*
		 * Since seq can easily wrap around over time, subtract
		 * the last seq at which timeouts were flushed before comparing.
		 * Assuming not more than 2^31-1 events have happened since,
		 * these subtractions won't have wrapped, so we can check if
		 * target is in [last_seq, current_seq] by comparing the two.
		 */
		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
	spin_unlock_irq(&ctx->timeout_lock);
79ebeaee 1827 spin_unlock_irq(&ctx->timeout_lock);
360428f8 1828}
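/*
 * Illustrative sketch, not part of io_uring.c: how the unsigned subtraction
 * in io_flush_timeouts() stays correct across a u32 sequence wrap. The
 * concrete values below are hypothetical and only demonstrate the arithmetic.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void timeout_seq_wrap_example(void)
{
	uint32_t last_flush = 0xfffffff0u;	/* cq_last_tm_flush, about to wrap */
	uint32_t target     = 0x00000005u;	/* timeout target_seq, post-wrap */
	uint32_t current    = 0x00000002u;	/* current seq, post-wrap */

	uint32_t events_needed = target - last_flush;	/* 0x15 == 21 */
	uint32_t events_got    = current - last_flush;	/* 0x12 == 18 */

	/* 18 < 21: the target hasn't been reached, so the timeout must not fire */
	assert(events_got < events_needed);
}
#endif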
5262f567 1829
9333f6b4
PB
1830static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1831{
1832 /* order cqe stores with ring update */
1833 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1834}
1835
9aa8dfde 1836static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
360428f8 1837{
9aa8dfde
PB
1838 if (ctx->off_timeout_used || ctx->drain_active) {
1839 spin_lock(&ctx->completion_lock);
1840 if (ctx->off_timeout_used)
1841 io_flush_timeouts(ctx);
1842 if (ctx->drain_active)
1843 io_queue_deferred(ctx);
1844 io_commit_cqring(ctx);
1845 spin_unlock(&ctx->completion_lock);
1846 }
1847 if (ctx->has_evfd)
1848 io_eventfd_signal(ctx);
de0617e4
JA
1849}
1850
90554200
JA
1851static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1852{
1853 struct io_rings *r = ctx->rings;
1854
a566c556 1855 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
90554200
JA
1856}
1857
888aae2e
PB
1858static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1859{
1860 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1861}
1862
d8da428b
PB
1863/*
1864 * writes to the cq entry need to come after reading head; the
1865 * control dependency is enough as we're using WRITE_ONCE to
1866 * fill the cq entry
1867 */
1868static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
2b188cc1 1869{
75b28aff 1870 struct io_rings *rings = ctx->rings;
d8da428b
PB
1871 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
1872 unsigned int free, queued, len;
1873
1874 /* userspace may cheat by modifying the tail; be safe and do min */
1875 queued = min(__io_cqring_events(ctx), ctx->cq_entries);
1876 free = ctx->cq_entries - queued;
1877 /* we need a contiguous range, limit based on the current array offset */
1878 len = min(free, ctx->cq_entries - off);
1879 if (!len)
2b188cc1
JA
1880 return NULL;
1881
d8da428b
PB
1882 ctx->cached_cq_tail++;
1883 ctx->cqe_cached = &rings->cqes[off];
1884 ctx->cqe_sentinel = ctx->cqe_cached + len;
1885 return ctx->cqe_cached++;
1886}
1887
1888static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1889{
1890 if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
1891 ctx->cached_cq_tail++;
1892 return ctx->cqe_cached++;
1893 }
1894 return __io_get_cqe(ctx);
2b188cc1
JA
1895}
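/*
 * Illustrative sketch, not part of io_uring.c: the contiguous-range maths
 * that __io_get_cqe() uses to prime the cqe_cached/cqe_sentinel window.
 * Ring sizes and indices below are hypothetical.
 */
#if 0
#include <assert.h>

static void cqe_cache_range_example(void)
{
	unsigned int cq_entries = 8;		/* CQ ring size (power of two) */
	unsigned int cached_cq_tail = 13;	/* kernel-side tail */
	unsigned int cq_head = 7;		/* entries userspace has consumed */

	unsigned int off = cached_cq_tail & (cq_entries - 1);	/* 5 */
	unsigned int queued = cached_cq_tail - cq_head;		/* 6 CQEs pending */
	unsigned int free = cq_entries - queued;		/* 2 slots free */
	/* only a contiguous run can be cached, so clamp at the array end */
	unsigned int len = free < cq_entries - off ? free : cq_entries - off;

	assert(off == 5 && free == 2 && len == 2);
}
#endif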
1896
77bc59b4 1897static void io_eventfd_signal(struct io_ring_ctx *ctx)
f2842ab5 1898{
77bc59b4
UA
1899 struct io_ev_fd *ev_fd;
1900
77bc59b4
UA
1901 rcu_read_lock();
1902 /*
1903 * rcu_dereference ctx->io_ev_fd once and use it for both checking
1904 * and eventfd_signal
1905 */
1906 ev_fd = rcu_dereference(ctx->io_ev_fd);
1907
1908 /*
1909 * Check again if ev_fd exists in case an io_eventfd_unregister call
1910 * completed between the NULL check of ctx->io_ev_fd at the start of
1911 * the function and rcu_read_lock.
1912 */
1913 if (unlikely(!ev_fd))
1914 goto out;
7e55a19c 1915 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
77bc59b4
UA
1916 goto out;
1917
c75312dd 1918 if (!ev_fd->eventfd_async || io_wq_current_is_worker())
77bc59b4 1919 eventfd_signal(ev_fd->cq_ev_fd, 1);
77bc59b4
UA
1920out:
1921 rcu_read_unlock();
f2842ab5
JA
1922}
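/*
 * Illustrative userspace sketch, not part of io_uring.c: registering an
 * eventfd (via liburing) so that io_eventfd_signal() above has a consumer.
 * Assumes liburing is available; error handling is omitted for brevity.
 */
#if 0
#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>

static void eventfd_consumer_example(void)
{
	struct io_uring ring;
	uint64_t count;
	int efd;

	io_uring_queue_init(8, &ring, 0);
	efd = eventfd(0, EFD_CLOEXEC);
	/* the kernel calls eventfd_signal() as CQEs are posted */
	io_uring_register_eventfd(&ring, efd);

	/* ... submit requests here ... */

	/* blocks until at least one notification has been counted */
	read(efd, &count, sizeof(count));

	io_uring_unregister_eventfd(&ring);
	close(efd);
	io_uring_queue_exit(&ring);
}
#endif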
1923
9aa8dfde
PB
1924static inline void io_cqring_wake(struct io_ring_ctx *ctx)
1925{
1926 /*
1927 * wake_up_all() may seem excessive, but io_wake_function() and
1928 * io_should_wake() handle the termination of the loop and only
1929 * wake as many waiters as we need to.
1930 */
1931 if (wq_has_sleeper(&ctx->cq_wait))
1932 wake_up_all(&ctx->cq_wait);
1933}
1934
2c5d763c
JA
1935/*
1936 * This should only get called when at least one event has been posted.
1937 * Some applications rely on the eventfd notification count only changing
1938 * IFF a new CQE has been added to the CQ ring. There's no dependency on
1939 * a 1:1 relationship between how many times this function is called (and
1940 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
1941 */
66fc25ca 1942static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 1943{
9aa8dfde
PB
1944 if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1945 ctx->has_evfd))
9333f6b4
PB
1946 __io_commit_cqring_flush(ctx);
1947
9aa8dfde 1948 io_cqring_wake(ctx);
1d7bb1d5
JA
1949}
1950
80c18e4a
PB
1951static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1952{
9aa8dfde
PB
1953 if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1954 ctx->has_evfd))
9333f6b4
PB
1955 __io_commit_cqring_flush(ctx);
1956
9aa8dfde
PB
1957 if (ctx->flags & IORING_SETUP_SQPOLL)
1958 io_cqring_wake(ctx);
80c18e4a
PB
1959}
1960
c4a2ed72 1961/* Returns true if there are no backlogged entries after the flush */
6c2450ae 1962static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5 1963{
b18032bb 1964 bool all_flushed, posted;
1d7bb1d5 1965
a566c556 1966 if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
e23de15f 1967 return false;
1d7bb1d5 1968
b18032bb 1969 posted = false;
79ebeaee 1970 spin_lock(&ctx->completion_lock);
6c2450ae 1971 while (!list_empty(&ctx->cq_overflow_list)) {
d068b506 1972 struct io_uring_cqe *cqe = io_get_cqe(ctx);
6c2450ae 1973 struct io_overflow_cqe *ocqe;
e6c8aa9a 1974
1d7bb1d5
JA
1975 if (!cqe && !force)
1976 break;
6c2450ae
PB
1977 ocqe = list_first_entry(&ctx->cq_overflow_list,
1978 struct io_overflow_cqe, list);
1979 if (cqe)
1980 memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1981 else
8f6ed49a
PB
1982 io_account_cq_overflow(ctx);
1983
b18032bb 1984 posted = true;
6c2450ae
PB
1985 list_del(&ocqe->list);
1986 kfree(ocqe);
1d7bb1d5
JA
1987 }
1988
09e88404
PB
1989 all_flushed = list_empty(&ctx->cq_overflow_list);
1990 if (all_flushed) {
5ed7a37d 1991 clear_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
1992 WRITE_ONCE(ctx->rings->sq_flags,
1993 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
09e88404 1994 }
46930143 1995
60053be8 1996 io_commit_cqring(ctx);
79ebeaee 1997 spin_unlock(&ctx->completion_lock);
b18032bb
JA
1998 if (posted)
1999 io_cqring_ev_posted(ctx);
09e88404 2000 return all_flushed;
1d7bb1d5
JA
2001}
2002
90f67366 2003static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
6c503150 2004{
ca0a2651
JA
2005 bool ret = true;
2006
5ed7a37d 2007 if (test_bit(0, &ctx->check_cq_overflow)) {
6c503150
PB
2008 /* iopoll syncs against uring_lock, not completion_lock */
2009 if (ctx->flags & IORING_SETUP_IOPOLL)
2010 mutex_lock(&ctx->uring_lock);
90f67366 2011 ret = __io_cqring_overflow_flush(ctx, false);
6c503150
PB
2012 if (ctx->flags & IORING_SETUP_IOPOLL)
2013 mutex_unlock(&ctx->uring_lock);
2014 }
ca0a2651
JA
2015
2016 return ret;
6c503150
PB
2017}
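/*
 * Illustrative userspace sketch, not part of io_uring.c: how an application
 * can notice that completions went to the overflow list and nudge the kernel
 * into io_cqring_overflow_flush() above. Uses liburing's mmapped sq flags;
 * error handling is omitted and the helper name is hypothetical.
 */
#if 0
#include <liburing.h>

static void drain_cq_overflow(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	/* set by io_cqring_event_overflow() when the CQ ring was full */
	if (IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW) {
		/*
		 * liburing's wait/peek helpers enter the kernel with
		 * IORING_ENTER_GETEVENTS when this flag is set, which
		 * flushes the backlog into the CQ ring.
		 */
		io_uring_wait_cqe(ring, &cqe);
	}
}
#endif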
2018
9d170164 2019static void __io_put_task(struct task_struct *task, int nr)
6a290a14
PB
2020{
2021 struct io_uring_task *tctx = task->io_uring;
2022
9d170164
PB
2023 percpu_counter_sub(&tctx->inflight, nr);
2024 if (unlikely(atomic_read(&tctx->in_idle)))
2025 wake_up(&tctx->wait);
2026 put_task_struct_many(task, nr);
2027}
2028
2029/* must be called somewhat shortly after putting a request */
2030static inline void io_put_task(struct task_struct *task, int nr)
2031{
2032 if (likely(task == current))
2033 task->io_uring->cached_refs += nr;
2034 else
2035 __io_put_task(task, nr);
6a290a14
PB
2036}
2037
9a10867a
PB
2038static void io_task_refs_refill(struct io_uring_task *tctx)
2039{
2040 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
2041
2042 percpu_counter_add(&tctx->inflight, refill);
2043 refcount_add(refill, &current->usage);
2044 tctx->cached_refs += refill;
2045}
2046
2047static inline void io_get_task_refs(int nr)
2048{
2049 struct io_uring_task *tctx = current->io_uring;
2050
2051 tctx->cached_refs -= nr;
2052 if (unlikely(tctx->cached_refs < 0))
2053 io_task_refs_refill(tctx);
2054}
2055
3cc7fdb9
PB
2056static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
2057{
2058 struct io_uring_task *tctx = task->io_uring;
2059 unsigned int refs = tctx->cached_refs;
2060
2061 if (refs) {
2062 tctx->cached_refs = 0;
2063 percpu_counter_sub(&tctx->inflight, refs);
2064 put_task_struct_many(task, refs);
2065 }
2066}
2067
d4d19c19 2068static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
54daa9b2 2069 s32 res, u32 cflags)
2b188cc1 2070{
cce4b8b0 2071 struct io_overflow_cqe *ocqe;
2b188cc1 2072
cce4b8b0
PB
2073 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
2074 if (!ocqe) {
2075 /*
2076 * If we're in ring overflow flush mode, or in task cancel mode,
2077 * or cannot allocate an overflow entry, then we need to drop it
2078 * on the floor.
2079 */
8f6ed49a 2080 io_account_cq_overflow(ctx);
cce4b8b0 2081 return false;
2b188cc1 2082 }
cce4b8b0 2083 if (list_empty(&ctx->cq_overflow_list)) {
5ed7a37d 2084 set_bit(0, &ctx->check_cq_overflow);
20c0b380
NA
2085 WRITE_ONCE(ctx->rings->sq_flags,
2086 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
2087
cce4b8b0 2088 }
d4d19c19 2089 ocqe->cqe.user_data = user_data;
cce4b8b0
PB
2090 ocqe->cqe.res = res;
2091 ocqe->cqe.flags = cflags;
2092 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
2093 return true;
2b188cc1
JA
2094}
2095
ae4da189 2096static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
913a571a 2097 s32 res, u32 cflags)
2b188cc1
JA
2098{
2099 struct io_uring_cqe *cqe;
2100
2101 /*
2102 * If we can't get a cq entry, userspace overflowed the
2103 * submission (by quite a lot). Increment the overflow count in
2104 * the ring.
2105 */
d068b506 2106 cqe = io_get_cqe(ctx);
1d7bb1d5 2107 if (likely(cqe)) {
d4d19c19 2108 WRITE_ONCE(cqe->user_data, user_data);
2b188cc1 2109 WRITE_ONCE(cqe->res, res);
bcda7baa 2110 WRITE_ONCE(cqe->flags, cflags);
8d13326e 2111 return true;
2b188cc1 2112 }
d4d19c19 2113 return io_cqring_event_overflow(ctx, user_data, res, cflags);
2b188cc1
JA
2114}
2115
90e7c35f
PB
2116static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
2117 struct io_kiocb *req)
2118{
2119 struct io_uring_cqe *cqe;
2120
2121 trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
2122 req->cqe.res, req->cqe.flags);
2123
2124 /*
2125 * If we can't get a cq entry, userspace overflowed the
2126 * submission (by quite a lot). Increment the overflow count in
2127 * the ring.
2128 */
2129 cqe = io_get_cqe(ctx);
2130 if (likely(cqe)) {
2131 memcpy(cqe, &req->cqe, sizeof(*cqe));
2132 return true;
2133 }
2134 return io_cqring_event_overflow(ctx, req->cqe.user_data,
2135 req->cqe.res, req->cqe.flags);
2136}
2137
ae4da189 2138static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
d5ec1dfa 2139{
cef216fc
PB
2140 trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
2141 return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
d5ec1dfa
SR
2142}
2143
913a571a
PB
2144static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
2145 s32 res, u32 cflags)
bcda7baa 2146{
913a571a 2147 ctx->cq_extra++;
502c87d6 2148 trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
ae4da189 2149 return __io_fill_cqe(ctx, user_data, res, cflags);
bcda7baa
JA
2150}
2151
a37fae8a
HX
2152static void __io_req_complete_post(struct io_kiocb *req, s32 res,
2153 u32 cflags)
2b188cc1 2154{
78e19bbe 2155 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 2156
04c76b41 2157 if (!(req->flags & REQ_F_CQE_SKIP))
ae4da189 2158 __io_fill_cqe_req(req, res, cflags);
c7dae4ba
JA
2159 /*
2160 * If we're the last reference to this request, add to our locked
2161 * free_list cache.
2162 */
de9b4cca 2163 if (req_ref_put_and_test(req)) {
da1a08c5 2164 if (req->flags & IO_REQ_LINK_FLAGS) {
0756a869 2165 if (req->flags & IO_DISARM_MASK)
7a612350
PB
2166 io_disarm_next(req);
2167 if (req->link) {
2168 io_req_task_queue(req->link);
2169 req->link = NULL;
2170 }
2171 }
ab409402 2172 io_req_put_rsrc(req, ctx);
8197b053
PB
2173 /*
2174 * Selected buffer deallocation in io_clean_op() assumes that
2175 * we don't hold ->completion_lock. Clean them here to avoid
2176 * deadlocks.
2177 */
2178 io_put_kbuf_comp(req);
c7dae4ba
JA
2179 io_dismantle_req(req);
2180 io_put_task(req->task, 1);
c2b6c6bc 2181 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
d0acdee2 2182 ctx->locked_free_nr++;
180f829f 2183 }
a37fae8a
HX
2184}
2185
2186static void io_req_complete_post(struct io_kiocb *req, s32 res,
2187 u32 cflags)
2188{
2189 struct io_ring_ctx *ctx = req->ctx;
2190
2191 spin_lock(&ctx->completion_lock);
2192 __io_req_complete_post(req, res, cflags);
7a612350 2193 io_commit_cqring(ctx);
79ebeaee 2194 spin_unlock(&ctx->completion_lock);
a3f34907 2195 io_cqring_ev_posted(ctx);
4e3d9ff9
JA
2196}
2197
54daa9b2
PB
2198static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
2199 u32 cflags)
229a7b63 2200{
cef216fc
PB
2201 req->cqe.res = res;
2202 req->cqe.flags = cflags;
e342c807 2203 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
2204}
2205
889fca73 2206static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
54daa9b2 2207 s32 res, u32 cflags)
bcda7baa 2208{
889fca73
PB
2209 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
2210 io_req_complete_state(req, res, cflags);
a38d68db 2211 else
c7dae4ba 2212 io_req_complete_post(req, res, cflags);
bcda7baa
JA
2213}
2214
54daa9b2 2215static inline void io_req_complete(struct io_kiocb *req, s32 res)
0ddf92e8 2216{
889fca73 2217 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
2218}
2219
54daa9b2 2220static void io_req_complete_failed(struct io_kiocb *req, s32 res)
f41db273 2221{
93d2bcd2 2222 req_set_fail(req);
ab0ac095 2223 io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
f41db273
PB
2224}
2225
864ea921
PB
2226/*
2227 * Don't initialise the fields below on every allocation, but do that in
2228 * advance and keep them valid across allocations.
2229 */
2230static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
2231{
2232 req->ctx = ctx;
2233 req->link = NULL;
2234 req->async_data = NULL;
2235 /* not necessary, but safer to zero */
cef216fc 2236 req->cqe.res = 0;
864ea921
PB
2237}
2238
dac7a098 2239static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
cd0ca2e0 2240 struct io_submit_state *state)
dac7a098 2241{
79ebeaee 2242 spin_lock(&ctx->completion_lock);
c2b6c6bc 2243 wq_list_splice(&ctx->locked_free_list, &state->free_list);
d0acdee2 2244 ctx->locked_free_nr = 0;
79ebeaee 2245 spin_unlock(&ctx->completion_lock);
dac7a098
PB
2246}
2247
88ab95be
PB
2248static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
2249{
2250 return !ctx->submit_state.free_list.next;
2251}
2252
5d5901a3
PB
2253/*
2254 * A request might get retired back into the request caches even before opcode
2255 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
2256 * Because of that, io_alloc_req() should be called only under ->uring_lock
2257 * and with extra caution to not get a request that is still worked on.
2258 */
c072481d 2259static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
5d5901a3 2260 __must_hold(&ctx->uring_lock)
2b188cc1 2261{
864ea921 2262 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
3ab665b7 2263 void *reqs[IO_REQ_ALLOC_BATCH];
864ea921 2264 int ret, i;
e5d1bc0a 2265
23a5c43b
PB
2266 /*
2267 * If we have more than a batch's worth of requests in our IRQ side
2268 * locked cache, grab the lock and move them over to our submission
2269 * side cache.
2270 */
a6d97a8a 2271 if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
23a5c43b 2272 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
88ab95be 2273 if (!io_req_cache_empty(ctx))
23a5c43b
PB
2274 return true;
2275 }
e5d1bc0a 2276
3ab665b7 2277 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
fd6fab2c 2278
864ea921
PB
2279 /*
2280 * Bulk alloc is all-or-nothing. If we fail to get a batch,
2281 * retry single alloc to be on the safe side.
2282 */
2283 if (unlikely(ret <= 0)) {
3ab665b7
PB
2284 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2285 if (!reqs[0])
a33ae9ce 2286 return false;
864ea921 2287 ret = 1;
2b188cc1 2288 }
864ea921 2289
37f0e767 2290 percpu_ref_get_many(&ctx->refs, ret);
3ab665b7 2291 for (i = 0; i < ret; i++) {
23a5c43b 2292 struct io_kiocb *req = reqs[i];
3ab665b7
PB
2293
2294 io_preinit_req(req, ctx);
fa05457a 2295 io_req_add_to_cache(req, ctx);
3ab665b7 2296 }
a33ae9ce
PB
2297 return true;
2298}
2299
2300static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
2301{
88ab95be 2302 if (unlikely(io_req_cache_empty(ctx)))
a33ae9ce
PB
2303 return __io_alloc_req_refill(ctx);
2304 return true;
2305}
2306
2307static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2308{
2309 struct io_wq_work_node *node;
2310
2311 node = wq_stack_extract(&ctx->submit_state.free_list);
c2b6c6bc 2312 return container_of(node, struct io_kiocb, comp_list);
2b188cc1
JA
2313}
2314
e1d767f0 2315static inline void io_put_file(struct file *file)
8da11c19 2316{
e1d767f0 2317 if (file)
8da11c19
PB
2318 fput(file);
2319}
2320
6b639522 2321static inline void io_dismantle_req(struct io_kiocb *req)
2b188cc1 2322{
094bae49 2323 unsigned int flags = req->flags;
929a3af9 2324
867f8fa5 2325 if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
3a0a6902 2326 io_clean_op(req);
e1d767f0
PB
2327 if (!(flags & REQ_F_FIXED_FILE))
2328 io_put_file(req->file);
e65ef56d
JA
2329}
2330
f5c6cf2a 2331static __cold void io_free_req(struct io_kiocb *req)
c6ca97b3 2332{
51a4cc11 2333 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 2334
ab409402 2335 io_req_put_rsrc(req, ctx);
216578e5 2336 io_dismantle_req(req);
7c660731 2337 io_put_task(req->task, 1);
c6ca97b3 2338
79ebeaee 2339 spin_lock(&ctx->completion_lock);
c2b6c6bc 2340 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
c34b025f 2341 ctx->locked_free_nr++;
79ebeaee 2342 spin_unlock(&ctx->completion_lock);
e65ef56d
JA
2343}
2344
f2f87370
PB
2345static inline void io_remove_next_linked(struct io_kiocb *req)
2346{
2347 struct io_kiocb *nxt = req->link;
2348
2349 req->link = nxt->link;
2350 nxt->link = NULL;
2351}
2352
33cc89a9
PB
2353static bool io_kill_linked_timeout(struct io_kiocb *req)
2354 __must_hold(&req->ctx->completion_lock)
89b263f6 2355 __must_hold(&req->ctx->timeout_lock)
2665abfd 2356{
33cc89a9 2357 struct io_kiocb *link = req->link;
f2f87370 2358
b97e736a 2359 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
c9abd7ad 2360 struct io_timeout_data *io = link->async_data;
7c86ffee 2361
f2f87370 2362 io_remove_next_linked(req);
90cd7e42 2363 link->timeout.head = NULL;
fd9c7bc5 2364 if (hrtimer_try_to_cancel(&io->timer) != -1) {
ef9dd637 2365 list_del(&link->timeout.list);
4e118cd9 2366 io_req_tw_post_queue(link, -ECANCELED, 0);
d4729fbd 2367 return true;
c9abd7ad
PB
2368 }
2369 }
d4729fbd 2370 return false;
7c86ffee
PB
2371}
2372
d148ca4b 2373static void io_fail_links(struct io_kiocb *req)
33cc89a9 2374 __must_hold(&req->ctx->completion_lock)
9e645e11 2375{
33cc89a9 2376 struct io_kiocb *nxt, *link = req->link;
04c76b41 2377 bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
9e645e11 2378
f2f87370 2379 req->link = NULL;
f2f87370 2380 while (link) {
a8295b98
HX
2381 long res = -ECANCELED;
2382
2383 if (link->flags & REQ_F_FAIL)
cef216fc 2384 res = link->cqe.res;
a8295b98 2385
f2f87370
PB
2386 nxt = link->link;
2387 link->link = NULL;
2665abfd 2388
cef216fc 2389 trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
502c87d6
SR
2390 req->opcode, link);
2391
4e118cd9
PB
2392 if (ignore_cqes)
2393 link->flags |= REQ_F_CQE_SKIP;
2394 else
04c76b41 2395 link->flags &= ~REQ_F_CQE_SKIP;
4e118cd9 2396 __io_req_complete_post(link, res, 0);
f2f87370 2397 link = nxt;
9e645e11 2398 }
33cc89a9 2399}
9e645e11 2400
33cc89a9
PB
2401static bool io_disarm_next(struct io_kiocb *req)
2402 __must_hold(&req->ctx->completion_lock)
2403{
2404 bool posted = false;
2405
0756a869
PB
2406 if (req->flags & REQ_F_ARM_LTIMEOUT) {
2407 struct io_kiocb *link = req->link;
2408
906c6caa 2409 req->flags &= ~REQ_F_ARM_LTIMEOUT;
0756a869
PB
2410 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2411 io_remove_next_linked(req);
4e118cd9 2412 io_req_tw_post_queue(link, -ECANCELED, 0);
0756a869
PB
2413 posted = true;
2414 }
2415 } else if (req->flags & REQ_F_LINK_TIMEOUT) {
89b263f6
JA
2416 struct io_ring_ctx *ctx = req->ctx;
2417
2418 spin_lock_irq(&ctx->timeout_lock);
33cc89a9 2419 posted = io_kill_linked_timeout(req);
89b263f6
JA
2420 spin_unlock_irq(&ctx->timeout_lock);
2421 }
93d2bcd2 2422 if (unlikely((req->flags & REQ_F_FAIL) &&
e4335ed3 2423 !(req->flags & REQ_F_HARDLINK))) {
33cc89a9
PB
2424 posted |= (req->link != NULL);
2425 io_fail_links(req);
2426 }
2427 return posted;
9e645e11
JA
2428}
2429
d81499bf
PB
2430static void __io_req_find_next_prep(struct io_kiocb *req)
2431{
2432 struct io_ring_ctx *ctx = req->ctx;
2433 bool posted;
2434
2435 spin_lock(&ctx->completion_lock);
2436 posted = io_disarm_next(req);
60053be8 2437 io_commit_cqring(ctx);
d81499bf
PB
2438 spin_unlock(&ctx->completion_lock);
2439 if (posted)
2440 io_cqring_ev_posted(ctx);
2441}
2442
2443static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
c69f8dbe 2444{
33cc89a9 2445 struct io_kiocb *nxt;
944e58bf 2446
9e645e11
JA
2447 /*
2448 * If LINK is set, we have dependent requests in this chain. If we
2449 * didn't fail this request, queue the first one up, moving any other
2450 * dependencies to the next request. In case of failure, fail the rest
2451 * of the chain.
2452 */
d81499bf
PB
2453 if (unlikely(req->flags & IO_DISARM_MASK))
2454 __io_req_find_next_prep(req);
33cc89a9
PB
2455 nxt = req->link;
2456 req->link = NULL;
2457 return nxt;
4d7dd462 2458}
9e645e11 2459
f237c30a 2460static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2c32395d
PB
2461{
2462 if (!ctx)
2463 return;
f237c30a 2464 if (*locked) {
c450178d 2465 io_submit_flush_completions(ctx);
2c32395d 2466 mutex_unlock(&ctx->uring_lock);
f237c30a 2467 *locked = false;
2c32395d
PB
2468 }
2469 percpu_ref_put(&ctx->refs);
2470}
2471
f28c240e
HX
2472static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
2473{
2474 io_commit_cqring(ctx);
2475 spin_unlock(&ctx->completion_lock);
2476 io_cqring_ev_posted(ctx);
2477}
2478
2479static void handle_prev_tw_list(struct io_wq_work_node *node,
2480 struct io_ring_ctx **ctx, bool *uring_locked)
2481{
2482 if (*ctx && !*uring_locked)
2483 spin_lock(&(*ctx)->completion_lock);
2484
2485 do {
2486 struct io_wq_work_node *next = node->next;
2487 struct io_kiocb *req = container_of(node, struct io_kiocb,
2488 io_task_work.node);
2489
34d2bfe7
JA
2490 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2491
f28c240e
HX
2492 if (req->ctx != *ctx) {
2493 if (unlikely(!*uring_locked && *ctx))
2494 ctx_commit_and_unlock(*ctx);
2495
2496 ctx_flush_and_put(*ctx, uring_locked);
2497 *ctx = req->ctx;
2498 /* if not contended, grab and improve batching */
2499 *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
2500 percpu_ref_get(&(*ctx)->refs);
2501 if (unlikely(!*uring_locked))
2502 spin_lock(&(*ctx)->completion_lock);
2503 }
2504 if (likely(*uring_locked))
2505 req->io_task_work.func(req, uring_locked);
2506 else
cef216fc 2507 __io_req_complete_post(req, req->cqe.res,
cc3cec83 2508 io_put_kbuf_comp(req));
f28c240e
HX
2509 node = next;
2510 } while (node);
2511
2512 if (unlikely(!*uring_locked))
2513 ctx_commit_and_unlock(*ctx);
2514}
2515
2516static void handle_tw_list(struct io_wq_work_node *node,
2517 struct io_ring_ctx **ctx, bool *locked)
9f8d032a
HX
2518{
2519 do {
2520 struct io_wq_work_node *next = node->next;
2521 struct io_kiocb *req = container_of(node, struct io_kiocb,
2522 io_task_work.node);
2523
34d2bfe7
JA
2524 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2525
9f8d032a
HX
2526 if (req->ctx != *ctx) {
2527 ctx_flush_and_put(*ctx, locked);
2528 *ctx = req->ctx;
2529 /* if not contended, grab and improve batching */
2530 *locked = mutex_trylock(&(*ctx)->uring_lock);
2531 percpu_ref_get(&(*ctx)->refs);
2532 }
2533 req->io_task_work.func(req, locked);
2534 node = next;
2535 } while (node);
2536}
2537
7cbf1722 2538static void tctx_task_work(struct callback_head *cb)
c40f6379 2539{
f28c240e 2540 bool uring_locked = false;
ebd0df2e 2541 struct io_ring_ctx *ctx = NULL;
3f18407d
PB
2542 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2543 task_work);
c40f6379 2544
16f72070 2545 while (1) {
f28c240e 2546 struct io_wq_work_node *node1, *node2;
3f18407d
PB
2547
2548 spin_lock_irq(&tctx->task_lock);
f28c240e
HX
2549 node1 = tctx->prior_task_list.first;
2550 node2 = tctx->task_list.first;
3f18407d 2551 INIT_WQ_LIST(&tctx->task_list);
f28c240e
HX
2552 INIT_WQ_LIST(&tctx->prior_task_list);
2553 if (!node2 && !node1)
6294f368 2554 tctx->task_running = false;
3f18407d 2555 spin_unlock_irq(&tctx->task_lock);
f28c240e 2556 if (!node2 && !node1)
6294f368 2557 break;
3f18407d 2558
f28c240e
HX
2559 if (node1)
2560 handle_prev_tw_list(node1, &ctx, &uring_locked);
f28c240e
HX
2561 if (node2)
2562 handle_tw_list(node2, &ctx, &uring_locked);
7cbf1722 2563 cond_resched();
68ca8fc0 2564
a6d97a8a
PB
2565 if (data_race(!tctx->task_list.first) &&
2566 data_race(!tctx->prior_task_list.first) && uring_locked)
68ca8fc0 2567 io_submit_flush_completions(ctx);
3f18407d 2568 }
ebd0df2e 2569
f28c240e 2570 ctx_flush_and_put(ctx, &uring_locked);
3cc7fdb9
PB
2571
2572 /* relaxed read is enough as only the task itself sets ->in_idle */
2573 if (unlikely(atomic_read(&tctx->in_idle)))
2574 io_uring_drop_tctx_refs(current);
7cbf1722
JA
2575}
2576
4813c377 2577static void io_req_task_work_add(struct io_kiocb *req, bool priority)
7cbf1722 2578{
c15b79de 2579 struct task_struct *tsk = req->task;
7cbf1722 2580 struct io_uring_task *tctx = tsk->io_uring;
c15b79de 2581 enum task_work_notify_mode notify;
e09ee510 2582 struct io_wq_work_node *node;
0b81e80c 2583 unsigned long flags;
6294f368 2584 bool running;
7cbf1722
JA
2585
2586 WARN_ON_ONCE(!tctx);
2587
d5361233
JA
2588 io_drop_inflight_file(req);
2589
0b81e80c 2590 spin_lock_irqsave(&tctx->task_lock, flags);
4813c377
HX
2591 if (priority)
2592 wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
2593 else
2594 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
6294f368
PB
2595 running = tctx->task_running;
2596 if (!running)
2597 tctx->task_running = true;
0b81e80c 2598 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2599
2600 /* task_work already pending, we're done */
6294f368 2601 if (running)
e09ee510 2602 return;
7cbf1722 2603
c15b79de
PB
2604 /*
2605 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2606 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2607 * processing task_work. There's no reliable way to tell if TWA_RESUME
2608 * will do the job.
2609 */
2610 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
d97ec623
PB
2611 if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2612 if (notify == TWA_NONE)
2613 wake_up_process(tsk);
e09ee510 2614 return;
c15b79de 2615 }
2215bed9 2616
0b81e80c 2617 spin_lock_irqsave(&tctx->task_lock, flags);
6294f368 2618 tctx->task_running = false;
4813c377 2619 node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
0b81e80c 2620 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722 2621
e09ee510
PB
2622 while (node) {
2623 req = container_of(node, struct io_kiocb, io_task_work.node);
2624 node = node->next;
2625 if (llist_add(&req->io_task_work.fallback_node,
2626 &req->ctx->fallback_llist))
2627 schedule_delayed_work(&req->ctx->fallback_work, 1);
2628 }
eab30c4d
PB
2629}
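/*
 * Illustrative sketch, not part of io_uring.c: the bare task_work pattern
 * that io_req_task_work_add() above builds its batched lists on top of.
 * The callback and the static callback_head are hypothetical.
 */
#if 0
#include <linux/task_work.h>
#include <linux/sched.h>

static void example_tw_cb(struct callback_head *cb)
{
	/* runs later, in the context of the task it was queued to */
}

static void queue_example_tw(struct task_struct *tsk)
{
	static struct callback_head cb;

	init_task_work(&cb, example_tw_cb);
	/* TWA_SIGNAL forces the task out of the kernel to run the work */
	if (task_work_add(tsk, &cb, TWA_SIGNAL))
		; /* task is exiting: drop or fall back, as the code above does */
}
#endif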
2630
4e118cd9
PB
2631static void io_req_tw_post(struct io_kiocb *req, bool *locked)
2632{
2633 io_req_complete_post(req, req->cqe.res, req->cqe.flags);
2634}
2635
2636static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
2637{
2638 req->cqe.res = res;
2639 req->cqe.flags = cflags;
2640 req->io_task_work.func = io_req_tw_post;
2641 io_req_task_work_add(req, false);
2642}
2643
f237c30a 2644static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
c40f6379 2645{
b18a1a45 2646 /* not needed for normal modes, but SQPOLL depends on it */
971cf9c1 2647 io_tw_lock(req->ctx, locked);
cef216fc 2648 io_req_complete_failed(req, req->cqe.res);
c40f6379
JA
2649}
2650
f237c30a 2651static void io_req_task_submit(struct io_kiocb *req, bool *locked)
c40f6379 2652{
971cf9c1 2653 io_tw_lock(req->ctx, locked);
316319e8 2654 /* req->task == current here, checking PF_EXITING is safe */
af066f31 2655 if (likely(!(req->task->flags & PF_EXITING)))
cbc2e203 2656 io_queue_sqe(req);
81b6d05c 2657 else
2593553a 2658 io_req_complete_failed(req, -EFAULT);
c40f6379
JA
2659}
2660
2c4b8eb6 2661static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
c40f6379 2662{
cef216fc 2663 req->cqe.res = ret;
5b0a6acc 2664 req->io_task_work.func = io_req_task_cancel;
4813c377 2665 io_req_task_work_add(req, false);
c40f6379
JA
2666}
2667
2c4b8eb6 2668static void io_req_task_queue(struct io_kiocb *req)
a3df7698 2669{
5b0a6acc 2670 req->io_task_work.func = io_req_task_submit;
4813c377 2671 io_req_task_work_add(req, false);
a3df7698
PB
2672}
2673
773af691
JA
2674static void io_req_task_queue_reissue(struct io_kiocb *req)
2675{
77955efb 2676 req->io_task_work.func = io_queue_iowq;
4813c377 2677 io_req_task_work_add(req, false);
773af691
JA
2678}
2679
57859f4d 2680static void io_queue_next(struct io_kiocb *req)
c69f8dbe 2681{
57859f4d 2682 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf 2683
57859f4d
PB
2684 if (nxt)
2685 io_req_task_queue(nxt);
c69f8dbe
JL
2686}
2687
3aa83bfb 2688static void io_free_batch_list(struct io_ring_ctx *ctx,
1cce17ac 2689 struct io_wq_work_node *node)
3aa83bfb 2690 __must_hold(&ctx->uring_lock)
5af1d13e 2691{
d4b7a5ef 2692 struct task_struct *task = NULL;
37f0e767 2693 int task_refs = 0;
5af1d13e 2694
3aa83bfb
PB
2695 do {
2696 struct io_kiocb *req = container_of(node, struct io_kiocb,
2697 comp_list);
2d6500d4 2698
a538be5b
PB
2699 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
2700 if (req->flags & REQ_F_REFCOUNT) {
2701 node = req->comp_list.next;
2702 if (!req_ref_put_and_test(req))
2703 continue;
2704 }
b605a7fa
PB
2705 if ((req->flags & REQ_F_POLLED) && req->apoll) {
2706 struct async_poll *apoll = req->apoll;
2707
2708 if (apoll->double_poll)
2709 kfree(apoll->double_poll);
2710 list_add(&apoll->poll.wait.entry,
2711 &ctx->apoll_cache);
2712 req->flags &= ~REQ_F_POLLED;
2713 }
da1a08c5 2714 if (req->flags & IO_REQ_LINK_FLAGS)
57859f4d 2715 io_queue_next(req);
a538be5b
PB
2716 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
2717 io_clean_op(req);
c1e53a69 2718 }
a538be5b
PB
2719 if (!(req->flags & REQ_F_FIXED_FILE))
2720 io_put_file(req->file);
2d6500d4 2721
ab409402 2722 io_req_put_rsrc_locked(req, ctx);
5af1d13e 2723
d4b7a5ef
PB
2724 if (req->task != task) {
2725 if (task)
2726 io_put_task(task, task_refs);
2727 task = req->task;
2728 task_refs = 0;
2729 }
2730 task_refs++;
c1e53a69 2731 node = req->comp_list.next;
fa05457a 2732 io_req_add_to_cache(req, ctx);
3aa83bfb 2733 } while (node);
d4b7a5ef 2734
d4b7a5ef
PB
2735 if (task)
2736 io_put_task(task, task_refs);
7a743e22
PB
2737}
2738
c450178d 2739static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
a141dd89 2740 __must_hold(&ctx->uring_lock)
905c172f 2741{
6f33b0bc 2742 struct io_wq_work_node *node, *prev;
cd0ca2e0 2743 struct io_submit_state *state = &ctx->submit_state;
905c172f 2744
3d4aeb9f
PB
2745 if (state->flush_cqes) {
2746 spin_lock(&ctx->completion_lock);
2747 wq_list_for_each(node, prev, &state->compl_reqs) {
2748 struct io_kiocb *req = container_of(node, struct io_kiocb,
6f33b0bc 2749 comp_list);
5182ed2e 2750
3d4aeb9f 2751 if (!(req->flags & REQ_F_CQE_SKIP))
90e7c35f 2752 __io_fill_cqe_req_filled(ctx, req);
3d4aeb9f
PB
2753 }
2754
2755 io_commit_cqring(ctx);
2756 spin_unlock(&ctx->completion_lock);
2757 io_cqring_ev_posted(ctx);
2758 state->flush_cqes = false;
905c172f 2759 }
5182ed2e 2760
1cce17ac 2761 io_free_batch_list(ctx, state->compl_reqs.first);
6f33b0bc 2762 INIT_WQ_LIST(&state->compl_reqs);
7a743e22
PB
2763}
2764
ba816ad6
JA
2765/*
2766 * Drop reference to request, return next in chain (if there is one) if this
2767 * was the last reference to this request.
2768 */
0d85035a 2769static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 2770{
9b5f7bd9
PB
2771 struct io_kiocb *nxt = NULL;
2772
de9b4cca 2773 if (req_ref_put_and_test(req)) {
da1a08c5 2774 if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
7819a1f6 2775 nxt = io_req_find_next(req);
f5c6cf2a 2776 io_free_req(req);
2a44f467 2777 }
9b5f7bd9 2778 return nxt;
2b188cc1
JA
2779}
2780
0d85035a 2781static inline void io_put_req(struct io_kiocb *req)
e65ef56d 2782{
f5c6cf2a
PB
2783 if (req_ref_put_and_test(req)) {
2784 io_queue_next(req);
e65ef56d 2785 io_free_req(req);
f5c6cf2a 2786 }
2b188cc1
JA
2787}
2788
6c503150 2789static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
2790{
2791 /* See comment at the top of this file */
2792 smp_rmb();
e23de15f 2793 return __io_cqring_events(ctx);
a3a0e43f
JA
2794}
2795
fb5ccc98
PB
2796static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2797{
2798 struct io_rings *rings = ctx->rings;
2799
2800 /* make sure SQ entry isn't read before tail */
2801 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2802}
2803
4c6e277c
JA
2804static inline bool io_run_task_work(void)
2805{
7f62d40d 2806 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
4c6e277c 2807 __set_current_state(TASK_RUNNING);
7c5d8fa6
EB
2808 clear_notify_signal();
2809 if (task_work_pending(current))
2810 task_work_run();
4c6e277c
JA
2811 return true;
2812 }
2813
2814 return false;
bcda7baa
JA
2815}
2816
5ba3c874 2817static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
def596e9 2818{
5eef4e87 2819 struct io_wq_work_node *pos, *start, *prev;
d729cf9a 2820 unsigned int poll_flags = BLK_POLL_NOSLEEP;
b688f11e 2821 DEFINE_IO_COMP_BATCH(iob);
5ba3c874 2822 int nr_events = 0;
def596e9
JA
2823
2824 /*
2825 * Only spin for completions if we don't have multiple devices hanging
87a115fb 2826 * off our complete list.
def596e9 2827 */
87a115fb 2828 if (ctx->poll_multi_queue || force_nonspin)
ef99b2d3 2829 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2830
5eef4e87
PB
2831 wq_list_for_each(pos, start, &ctx->iopoll_list) {
2832 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
9adbd45d 2833 struct kiocb *kiocb = &req->rw.kiocb;
a2416e1e 2834 int ret;
def596e9
JA
2835
2836 /*
581f9810
BM
2837 * Move completed and retryable entries to our local lists.
2838 * If we find a request that requires polling, break out
2839 * and complete those lists first, if we have entries there.
def596e9 2840 */
e3f721e6 2841 if (READ_ONCE(req->iopoll_completed))
def596e9
JA
2842 break;
2843
b688f11e 2844 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
a2416e1e
PB
2845 if (unlikely(ret < 0))
2846 return ret;
2847 else if (ret)
ef99b2d3 2848 poll_flags |= BLK_POLL_ONESHOT;
def596e9 2849
3aadc23e 2850 /* iopoll may have completed current req */
b688f11e
JA
2851 if (!rq_list_empty(iob.req_list) ||
2852 READ_ONCE(req->iopoll_completed))
e3f721e6 2853 break;
def596e9
JA
2854 }
2855
b688f11e
JA
2856 if (!rq_list_empty(iob.req_list))
2857 iob.complete(&iob);
5eef4e87
PB
2858 else if (!pos)
2859 return 0;
def596e9 2860
5eef4e87
PB
2861 prev = start;
2862 wq_list_for_each_resume(pos, prev) {
2863 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2864
b3fa03fd
PB
2865 /* order with io_complete_rw_iopoll(), e.g. ->result updates */
2866 if (!smp_load_acquire(&req->iopoll_completed))
e3f721e6 2867 break;
c0713540 2868 nr_events++;
83a13a41
PB
2869 if (unlikely(req->flags & REQ_F_CQE_SKIP))
2870 continue;
cef216fc 2871 __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
e3f721e6 2872 }
def596e9 2873
f5ed3bcd
PB
2874 if (unlikely(!nr_events))
2875 return 0;
2876
2877 io_commit_cqring(ctx);
2878 io_cqring_ev_posted_iopoll(ctx);
1cce17ac 2879 pos = start ? start->next : ctx->iopoll_list.first;
5eef4e87 2880 wq_list_cut(&ctx->iopoll_list, prev, start);
1cce17ac 2881 io_free_batch_list(ctx, pos);
5ba3c874 2882 return nr_events;
def596e9
JA
2883}
2884
def596e9
JA
2885/*
2886 * We can't just wait for polled events to come to us, we have to actively
2887 * find and complete them.
2888 */
c072481d 2889static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
2890{
2891 if (!(ctx->flags & IORING_SETUP_IOPOLL))
2892 return;
2893
2894 mutex_lock(&ctx->uring_lock);
5eef4e87 2895 while (!wq_list_empty(&ctx->iopoll_list)) {
b2edc0a7 2896 /* let it sleep and repeat later if can't complete a request */
5ba3c874 2897 if (io_do_iopoll(ctx, true) == 0)
b2edc0a7 2898 break;
08f5439f
JA
2899 /*
2900 * Ensure we allow local-to-the-cpu processing to take place;
2901 * in this case we need to ensure that we reap all events.
3fcee5a6 2902 * Also let task_work, etc., make progress by releasing the mutex.
08f5439f 2903 */
3fcee5a6
PB
2904 if (need_resched()) {
2905 mutex_unlock(&ctx->uring_lock);
2906 cond_resched();
2907 mutex_lock(&ctx->uring_lock);
2908 }
def596e9
JA
2909 }
2910 mutex_unlock(&ctx->uring_lock);
2911}
2912
7668b92a 2913static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 2914{
7668b92a 2915 unsigned int nr_events = 0;
e9979b36 2916 int ret = 0;
500f9fba 2917
f39c8a5b
PB
2918 /*
2919 * Don't enter poll loop if we already have events pending.
2920 * If we do, we can potentially be spinning for commands that
2921 * already triggered a CQE (eg in error).
2922 */
5ed7a37d 2923 if (test_bit(0, &ctx->check_cq_overflow))
f39c8a5b
PB
2924 __io_cqring_overflow_flush(ctx, false);
2925 if (io_cqring_events(ctx))
d487b43c 2926 return 0;
def596e9 2927 do {
500f9fba
JA
2928 /*
2929 * If a submit got punted to a workqueue, we can have the
2930 * application entering polling for a command before it gets
2931 * issued. That app will hold the uring_lock for the duration
2932 * of the poll right here, so we need to take a breather every
2933 * now and then to ensure that the issue has a chance to add
2934 * the poll to the issued list. Otherwise we can spin here
2935 * forever, while the workqueue is stuck trying to acquire the
2936 * very same mutex.
2937 */
5eef4e87 2938 if (wq_list_empty(&ctx->iopoll_list)) {
8f487ef2
PB
2939 u32 tail = ctx->cached_cq_tail;
2940
500f9fba 2941 mutex_unlock(&ctx->uring_lock);
4c6e277c 2942 io_run_task_work();
500f9fba 2943 mutex_lock(&ctx->uring_lock);
def596e9 2944
8f487ef2
PB
2945 /* some requests don't go through iopoll_list */
2946 if (tail != ctx->cached_cq_tail ||
5eef4e87 2947 wq_list_empty(&ctx->iopoll_list))
e9979b36 2948 break;
500f9fba 2949 }
5ba3c874
PB
2950 ret = io_do_iopoll(ctx, !min);
2951 if (ret < 0)
2952 break;
2953 nr_events += ret;
2954 ret = 0;
2955 } while (nr_events < min && !need_resched());
d487b43c 2956
def596e9
JA
2957 return ret;
2958}
2959
491381ce 2960static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 2961{
491381ce
JA
2962 /*
2963 * Tell lockdep we inherited freeze protection from submission
2964 * thread.
2965 */
2966 if (req->flags & REQ_F_ISREG) {
1c98679d 2967 struct super_block *sb = file_inode(req->file)->i_sb;
2b188cc1 2968
1c98679d
PB
2969 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2970 sb_end_write(sb);
2b188cc1
JA
2971 }
2972}
2973
b63534c4 2974#ifdef CONFIG_BLOCK
dc2a6e9a 2975static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4 2976{
ab454438 2977 struct io_async_rw *rw = req->async_data;
b63534c4 2978
d886e185 2979 if (!req_has_async_data(req))
ab454438 2980 return !io_req_prep_async(req);
538941e2 2981 iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
ab454438 2982 return true;
b63534c4 2983}
b63534c4 2984
3e6a0d3c 2985static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 2986{
355afaeb 2987 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 2988 struct io_ring_ctx *ctx = req->ctx;
b63534c4 2989
355afaeb
JA
2990 if (!S_ISBLK(mode) && !S_ISREG(mode))
2991 return false;
3e6a0d3c
JA
2992 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2993 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 2994 return false;
7c977a58
JA
2995 /*
2996 * If ref is dying, we might be running poll reap from the exit work.
2997 * Don't attempt to reissue from that path, just let it fail with
2998 * -EAGAIN.
2999 */
3e6a0d3c
JA
3000 if (percpu_ref_is_dying(&ctx->refs))
3001 return false;
ef046888
JA
3002 /*
3003 * Play it safe and assume it's not safe to re-import and reissue if we're
3004 * not in the original thread group (or in task context).
3005 */
3006 if (!same_thread_group(req->task, current) || !in_task())
3007 return false;
3e6a0d3c
JA
3008 return true;
3009}
e82ad485 3010#else
a1ff1e3f 3011static bool io_resubmit_prep(struct io_kiocb *req)
e82ad485
JA
3012{
3013 return false;
3014}
e82ad485 3015static bool io_rw_should_reissue(struct io_kiocb *req)
3e6a0d3c 3016{
b63534c4
JA
3017 return false;
3018}
3e6a0d3c 3019#endif
b63534c4 3020
8ef12efe 3021static bool __io_complete_rw_common(struct io_kiocb *req, long res)
a1d7c393 3022{
f63cf519 3023 if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
b65c128f 3024 kiocb_end_write(req);
f63cf519
JA
3025 fsnotify_modify(req->file);
3026 } else {
3027 fsnotify_access(req->file);
3028 }
cef216fc 3029 if (unlikely(res != req->cqe.res)) {
9532b99b
PB
3030 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
3031 io_rw_should_reissue(req)) {
3032 req->flags |= REQ_F_REISSUE;
8ef12efe 3033 return true;
9532b99b 3034 }
93d2bcd2 3035 req_set_fail(req);
cef216fc 3036 req->cqe.res = res;
9532b99b 3037 }
8ef12efe
JA
3038 return false;
3039}
3040
cc8e9ba7 3041static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
8ef12efe 3042{
cef216fc 3043 int res = req->cqe.res;
126180b9
PB
3044
3045 if (*locked) {
cc3cec83 3046 io_req_complete_state(req, res, io_put_kbuf(req, 0));
fff4e40e 3047 io_req_add_compl_list(req);
126180b9 3048 } else {
cc3cec83
JA
3049 io_req_complete_post(req, res,
3050 io_put_kbuf(req, IO_URING_F_UNLOCKED));
126180b9 3051 }
8ef12efe
JA
3052}
3053
00f6e68b 3054static void __io_complete_rw(struct io_kiocb *req, long res,
8ef12efe
JA
3055 unsigned int issue_flags)
3056{
3057 if (__io_complete_rw_common(req, res))
3058 return;
cef216fc 3059 __io_req_complete(req, issue_flags, req->cqe.res,
cc3cec83 3060 io_put_kbuf(req, issue_flags));
ba816ad6
JA
3061}
3062
6b19b766 3063static void io_complete_rw(struct kiocb *kiocb, long res)
ba816ad6 3064{
9adbd45d 3065 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 3066
8ef12efe
JA
3067 if (__io_complete_rw_common(req, res))
3068 return;
cef216fc 3069 req->cqe.res = res;
8ef12efe 3070 req->io_task_work.func = io_req_task_complete;
f28c240e 3071 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
2b188cc1
JA
3072}
3073
6b19b766 3074static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
def596e9 3075{
9adbd45d 3076 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 3077
491381ce
JA
3078 if (kiocb->ki_flags & IOCB_WRITE)
3079 kiocb_end_write(req);
cef216fc 3080 if (unlikely(res != req->cqe.res)) {
b66ceaf3
PB
3081 if (res == -EAGAIN && io_rw_should_reissue(req)) {
3082 req->flags |= REQ_F_REISSUE;
3083 return;
9532b99b 3084 }
cef216fc 3085 req->cqe.res = res;
8c130827 3086 }
bbde017a 3087
b3fa03fd
PB
3088 /* order with io_iopoll_complete() checking ->iopoll_completed */
3089 smp_store_release(&req->iopoll_completed, 1);
def596e9
JA
3090}
3091
3092/*
3093 * After the iocb has been issued, it's safe to be found on the poll list.
3094 * Adding the kiocb to the list AFTER submission ensures that we don't
f39c8a5b 3095 * find it from an io_do_iopoll() thread before the issuer is done
def596e9
JA
3096 * accessing the kiocb cookie.
3097 */
9882131c 3098static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
def596e9
JA
3099{
3100 struct io_ring_ctx *ctx = req->ctx;
3b44b371 3101 const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
cb3d8972
PB
3102
3103 /* workqueue context doesn't hold uring_lock, grab it now */
3b44b371 3104 if (unlikely(needs_lock))
cb3d8972 3105 mutex_lock(&ctx->uring_lock);
def596e9
JA
3106
3107 /*
3108 * Track whether we have multiple files in our lists. This will impact
3109 * how we do polling eventually, not spinning if we're on potentially
3110 * different devices.
3111 */
5eef4e87 3112 if (wq_list_empty(&ctx->iopoll_list)) {
915b3dde
HX
3113 ctx->poll_multi_queue = false;
3114 } else if (!ctx->poll_multi_queue) {
def596e9
JA
3115 struct io_kiocb *list_req;
3116
5eef4e87
PB
3117 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
3118 comp_list);
30da1b45 3119 if (list_req->file != req->file)
915b3dde 3120 ctx->poll_multi_queue = true;
def596e9
JA
3121 }
3122
3123 /*
3124 * For fast devices, IO may have already completed. If it has, add
3125 * it to the front so we find it first.
3126 */
65a6543d 3127 if (READ_ONCE(req->iopoll_completed))
5eef4e87 3128 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
def596e9 3129 else
5eef4e87 3130 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
bdcd3eab 3131
3b44b371 3132 if (unlikely(needs_lock)) {
cb3d8972
PB
3133 /*
3134 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
3135 * in sq thread task context or in io worker task context. If
3136 * the current task context is the sq thread, we don't need to
3137 * check whether we should wake up the sq thread.
3138 */
3139 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
3140 wq_has_sleeper(&ctx->sq_data->wait))
3141 wake_up(&ctx->sq_data->wait);
3142
3143 mutex_unlock(&ctx->uring_lock);
3144 }
def596e9
JA
3145}
3146
4503b767
JA
3147static bool io_bdev_nowait(struct block_device *bdev)
3148{
9ba0d0c8 3149 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
3150}
3151
2b188cc1
JA
3152/*
3153 * If we tracked the file through the SCM inflight mechanism, we could support
3154 * any file. For now, just ensure that anything potentially problematic is done
3155 * inline.
3156 */
88459b50 3157static bool __io_file_supports_nowait(struct file *file, umode_t mode)
2b188cc1 3158{
4503b767 3159 if (S_ISBLK(mode)) {
4e7b5671
CH
3160 if (IS_ENABLED(CONFIG_BLOCK) &&
3161 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
3162 return true;
3163 return false;
3164 }
976517f1 3165 if (S_ISSOCK(mode))
2b188cc1 3166 return true;
4503b767 3167 if (S_ISREG(mode)) {
4e7b5671
CH
3168 if (IS_ENABLED(CONFIG_BLOCK) &&
3169 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
3170 file->f_op != &io_uring_fops)
3171 return true;
3172 return false;
3173 }
2b188cc1 3174
c5b85625
JA
3175 /* any ->read/write should understand O_NONBLOCK */
3176 if (file->f_flags & O_NONBLOCK)
3177 return true;
35645ac3 3178 return file->f_mode & FMODE_NOWAIT;
2b188cc1 3179}
c5b85625 3180
88459b50
PB
3181/*
3182 * If we tracked the file through the SCM inflight mechanism, we could support
3183 * any file. For now, just ensure that anything potentially problematic is done
3184 * inline.
3185 */
3186static unsigned int io_file_get_flags(struct file *file)
3187{
3188 umode_t mode = file_inode(file)->i_mode;
3189 unsigned int res = 0;
af197f50 3190
88459b50
PB
3191 if (S_ISREG(mode))
3192 res |= FFS_ISREG;
3193 if (__io_file_supports_nowait(file, mode))
3194 res |= FFS_NOWAIT;
3195 return res;
2b188cc1
JA
3196}
3197
35645ac3 3198static inline bool io_file_supports_nowait(struct io_kiocb *req)
7b29f92d 3199{
88459b50 3200 return req->flags & REQ_F_SUPPORT_NOWAIT;
7b29f92d
JA
3201}
3202
b9a6b8f9 3203static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 3204{
9adbd45d 3205 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
3206 unsigned ioprio;
3207 int ret;
2b188cc1 3208
2b188cc1 3209 kiocb->ki_pos = READ_ONCE(sqe->off);
9adbd45d 3210
fb27274a
PB
3211 ioprio = READ_ONCE(sqe->ioprio);
3212 if (ioprio) {
3213 ret = ioprio_check_cap(ioprio);
3214 if (ret)
3215 return ret;
3216
3217 kiocb->ki_ioprio = ioprio;
3218 } else {
3219 kiocb->ki_ioprio = get_current_ioprio();
eae071c9
PB
3220 }
3221
578c0ee2 3222 req->imu = NULL;
3529d8c2
JA
3223 req->rw.addr = READ_ONCE(sqe->addr);
3224 req->rw.len = READ_ONCE(sqe->len);
584b0180 3225 req->rw.flags = READ_ONCE(sqe->rw_flags);
4f4eeba8 3226 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 3227 return 0;
2b188cc1
JA
3228}
3229
3230static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3231{
3232 switch (ret) {
3233 case -EIOCBQUEUED:
3234 break;
3235 case -ERESTARTSYS:
3236 case -ERESTARTNOINTR:
3237 case -ERESTARTNOHAND:
3238 case -ERESTART_RESTARTBLOCK:
3239 /*
3240 * We can't just restart the syscall, since previously
3241 * submitted sqes may already be in progress. Just fail this
3242 * IO with EINTR.
3243 */
3244 ret = -EINTR;
df561f66 3245 fallthrough;
2b188cc1 3246 default:
6b19b766 3247 kiocb->ki_complete(kiocb, ret);
2b188cc1
JA
3248 }
3249}
3250
b4aec400 3251static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
d34e1e5b
DY
3252{
3253 struct kiocb *kiocb = &req->rw.kiocb;
3254
6f83ab22
JA
3255 if (kiocb->ki_pos != -1)
3256 return &kiocb->ki_pos;
3257
3258 if (!(req->file->f_mode & FMODE_STREAM)) {
3259 req->flags |= REQ_F_CUR_POS;
3260 kiocb->ki_pos = req->file->f_pos;
3261 return &kiocb->ki_pos;
d34e1e5b 3262 }
6f83ab22
JA
3263
3264 kiocb->ki_pos = 0;
3265 return NULL;
d34e1e5b
DY
3266}
3267
2ea537ca 3268static void kiocb_done(struct io_kiocb *req, ssize_t ret,
889fca73 3269 unsigned int issue_flags)
ba816ad6 3270{
e8c2bc1f 3271 struct io_async_rw *io = req->async_data;
ba04291e 3272
227c0c96 3273 /* add previously done IO, if any */
d886e185 3274 if (req_has_async_data(req) && io->bytes_done > 0) {
227c0c96 3275 if (ret < 0)
e8c2bc1f 3276 ret = io->bytes_done;
227c0c96 3277 else
e8c2bc1f 3278 ret += io->bytes_done;
227c0c96
JA
3279 }
3280
ba04291e 3281 if (req->flags & REQ_F_CUR_POS)
2ea537ca
PB
3282 req->file->f_pos = req->rw.kiocb.ki_pos;
3283 if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
00f6e68b 3284 __io_complete_rw(req, ret, issue_flags);
ba816ad6 3285 else
2ea537ca 3286 io_rw_done(&req->rw.kiocb, ret);
97284637 3287
b66ceaf3 3288 if (req->flags & REQ_F_REISSUE) {
97284637 3289 req->flags &= ~REQ_F_REISSUE;
b91ef187 3290 if (io_resubmit_prep(req))
773af691 3291 io_req_task_queue_reissue(req);
b91ef187
PB
3292 else
3293 io_req_task_queue_fail(req, ret);
97284637 3294 }
ba816ad6
JA
3295}
3296
eae071c9
PB
3297static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3298 struct io_mapped_ubuf *imu)
edafccee 3299{
9adbd45d 3300 size_t len = req->rw.len;
75769e3f 3301 u64 buf_end, buf_addr = req->rw.addr;
edafccee 3302 size_t offset;
edafccee 3303
75769e3f 3304 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
edafccee
JA
3305 return -EFAULT;
3306 /* not inside the mapped region */
4751f53d 3307 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
edafccee
JA
3308 return -EFAULT;
3309
3310 /*
3311 * May not be a start of buffer, set size appropriately
3312 * and advance us to the beginning.
3313 */
3314 offset = buf_addr - imu->ubuf;
3315 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
3316
3317 if (offset) {
3318 /*
3319 * Don't use iov_iter_advance() here, as it's really slow for
3320 * using the latter parts of a big fixed buffer - it iterates
3321 * over each segment manually. We can cheat a bit here, because
3322 * we know that:
3323 *
3324 * 1) it's a BVEC iter, we set it up
3325 * 2) all bvecs are PAGE_SIZE in size, except potentially the
3326 * first and last bvec
3327 *
3328 * So just find our index, and adjust the iterator afterwards.
3330 * If the offset is within the first bvec (or the whole first
3331 * bvec), just use iov_iter_advance(). This makes it easier
3331 * since we can just skip the first segment, which may not
3332 * be PAGE_SIZE aligned.
3333 */
3334 const struct bio_vec *bvec = imu->bvec;
3335
3336 if (offset <= bvec->bv_len) {
3337 iov_iter_advance(iter, offset);
3338 } else {
3339 unsigned long seg_skip;
3340
3341 /* skip first vec */
3342 offset -= bvec->bv_len;
3343 seg_skip = 1 + (offset >> PAGE_SHIFT);
3344
3345 iter->bvec = bvec + seg_skip;
3346 iter->nr_segs -= seg_skip;
99c79f66 3347 iter->count -= bvec->bv_len + offset;
bd11b3a3 3348 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
3349 }
3350 }
3351
847595de 3352 return 0;
edafccee
JA
3353}
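/*
 * Illustrative sketch, not part of io_uring.c: the segment-skip arithmetic
 * used by __io_import_fixed() above, for a registered buffer whose first
 * bvec is short and whose middle bvecs are page sized. PAGE_SIZE is taken
 * as 4096 and all values are hypothetical.
 */
#if 0
#include <assert.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)

static void fixed_buf_seek_example(void)
{
	unsigned long first_bv_len = 512;	/* unaligned first bvec */
	unsigned long offset = 9000;		/* byte offset into the buffer */

	/* skip the short first vec, then whole page-sized vecs */
	offset -= first_bv_len;					/* 8488 */
	unsigned long seg_skip = 1 + (offset >> EX_PAGE_SHIFT);	/* 1 + 2 = 3 */
	unsigned long iov_offset = offset & (EX_PAGE_SIZE - 1);	/* 296 */

	assert(seg_skip == 3 && iov_offset == 296);
}
#endif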
3354
5106dd6e
JA
3355static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3356 unsigned int issue_flags)
eae071c9 3357{
eae071c9
PB
3358 struct io_mapped_ubuf *imu = req->imu;
3359 u16 index, buf_index = req->buf_index;
3360
3361 if (likely(!imu)) {
578c0ee2
PB
3362 struct io_ring_ctx *ctx = req->ctx;
3363
eae071c9
PB
3364 if (unlikely(buf_index >= ctx->nr_user_bufs))
3365 return -EFAULT;
5106dd6e 3366 io_req_set_rsrc_node(req, ctx, issue_flags);
eae071c9
PB
3367 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3368 imu = READ_ONCE(ctx->user_bufs[index]);
3369 req->imu = imu;
3370 }
3371 return __io_import_fixed(req, rw, iter, imu);
3372}
3373
dbc7d452
JA
3374static void io_buffer_add_list(struct io_ring_ctx *ctx,
3375 struct io_buffer_list *bl, unsigned int bgid)
3376{
3377 struct list_head *list;
3378
3379 list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
3380 INIT_LIST_HEAD(&bl->buf_list);
3381 bl->bgid = bgid;
3382 list_add(&bl->list, list);
3383}
3384
bcda7baa 3385static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
51aac424 3386 int bgid, unsigned int issue_flags)
bcda7baa 3387{
30d51dd4 3388 struct io_buffer *kbuf = req->kbuf;
dbc7d452
JA
3389 struct io_ring_ctx *ctx = req->ctx;
3390 struct io_buffer_list *bl;
bcda7baa
JA
3391
3392 if (req->flags & REQ_F_BUFFER_SELECTED)
3393 return kbuf;
3394
f8929630 3395 io_ring_submit_lock(req->ctx, issue_flags);
bcda7baa 3396
dbc7d452
JA
3397 bl = io_buffer_get_list(ctx, bgid);
3398 if (bl && !list_empty(&bl->buf_list)) {
3399 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
3400 list_del(&kbuf->list);
bcda7baa
JA
3401 if (*len > kbuf->len)
3402 *len = kbuf->len;
30d51dd4
PB
3403 req->flags |= REQ_F_BUFFER_SELECTED;
3404 req->kbuf = kbuf;
bcda7baa
JA
3405 } else {
3406 kbuf = ERR_PTR(-ENOBUFS);
3407 }
3408
f8929630 3409 io_ring_submit_unlock(req->ctx, issue_flags);
bcda7baa
JA
3410 return kbuf;
3411}
3412
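/*
 * Illustrative userspace sketch, not part of io_uring.c: how a submitter
 * asks for a buffer from a provided-buffer group, which is what drives
 * io_buffer_select() above. Assumes liburing, a connected socket, and that
 * buffers were provided earlier under group id 7 with
 * IORING_OP_PROVIDE_BUFFERS.
 */
#include <liburing.h>

static int queue_recv_with_buf_select(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	/* no buffer pointer: the kernel picks one from group 7 at issue time */
	io_uring_prep_recv(sqe, sockfd, NULL, 4096, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = 7;
	return io_uring_submit(ring);
}

/*
 * On completion, the chosen buffer id comes back in the CQE:
 *   if (cqe->flags & IORING_CQE_F_BUFFER)
 *           bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
 */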
4d954c25 3413static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
51aac424 3414 unsigned int issue_flags)
4d954c25
JA
3415{
3416 struct io_buffer *kbuf;
4f4eeba8 3417 u16 bgid;
4d954c25 3418
4f4eeba8 3419 bgid = req->buf_index;
51aac424 3420 kbuf = io_buffer_select(req, len, bgid, issue_flags);
4d954c25
JA
3421 if (IS_ERR(kbuf))
3422 return kbuf;
4d954c25
JA
3423 return u64_to_user_ptr(kbuf->addr);
3424}
3425
3426#ifdef CONFIG_COMPAT
3427static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
51aac424 3428 unsigned int issue_flags)
4d954c25
JA
3429{
3430 struct compat_iovec __user *uiov;
3431 compat_ssize_t clen;
3432 void __user *buf;
3433 ssize_t len;
3434
3435 uiov = u64_to_user_ptr(req->rw.addr);
3436 if (!access_ok(uiov, sizeof(*uiov)))
3437 return -EFAULT;
3438 if (__get_user(clen, &uiov->iov_len))
3439 return -EFAULT;
3440 if (clen < 0)
3441 return -EINVAL;
3442
3443 len = clen;
51aac424 3444 buf = io_rw_buffer_select(req, &len, issue_flags);
4d954c25
JA
3445 if (IS_ERR(buf))
3446 return PTR_ERR(buf);
3447 iov[0].iov_base = buf;
3448 iov[0].iov_len = (compat_size_t) len;
3449 return 0;
3450}
3451#endif
3452
3453static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
51aac424 3454 unsigned int issue_flags)
4d954c25
JA
3455{
3456 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3457 void __user *buf;
3458 ssize_t len;
3459
3460 if (copy_from_user(iov, uiov, sizeof(*uiov)))
3461 return -EFAULT;
3462
3463 len = iov[0].iov_len;
3464 if (len < 0)
3465 return -EINVAL;
51aac424 3466 buf = io_rw_buffer_select(req, &len, issue_flags);
4d954c25
JA
3467 if (IS_ERR(buf))
3468 return PTR_ERR(buf);
3469 iov[0].iov_base = buf;
3470 iov[0].iov_len = len;
3471 return 0;
3472}
3473
3474static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
51aac424 3475 unsigned int issue_flags)
4d954c25 3476{
dddb3e26 3477 if (req->flags & REQ_F_BUFFER_SELECTED) {
30d51dd4 3478 struct io_buffer *kbuf = req->kbuf;
dddb3e26 3479
dddb3e26
JA
3480 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3481 iov[0].iov_len = kbuf->len;
4d954c25 3482 return 0;
dddb3e26 3483 }
dd201662 3484 if (req->rw.len != 1)
4d954c25
JA
3485 return -EINVAL;
3486
3487#ifdef CONFIG_COMPAT
3488 if (req->ctx->compat)
51aac424 3489 return io_compat_import(req, iov, issue_flags);
4d954c25
JA
3490#endif
3491
51aac424 3492 return __io_iov_buffer_select(req, iov, issue_flags);
4d954c25
JA
3493}
3494
caa8fe6e
PB
3495static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3496 struct io_rw_state *s,
3497 unsigned int issue_flags)
2b188cc1 3498{
5e49c973 3499 struct iov_iter *iter = &s->iter;
847595de 3500 u8 opcode = req->opcode;
caa8fe6e 3501 struct iovec *iovec;
d1d681b0
PB
3502 void __user *buf;
3503 size_t sqe_len;
4d954c25 3504 ssize_t ret;
edafccee 3505
f3251183 3506 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
5106dd6e 3507 ret = io_import_fixed(req, rw, iter, issue_flags);
f3251183
PB
3508 if (ret)
3509 return ERR_PTR(ret);
3510 return NULL;
3511 }
2b188cc1 3512
bcda7baa 3513 /* buffer index only valid with fixed read/write, or buffer select */
d1d681b0 3514 if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
caa8fe6e 3515 return ERR_PTR(-EINVAL);
9adbd45d 3516
d1d681b0
PB
3517 buf = u64_to_user_ptr(req->rw.addr);
3518 sqe_len = req->rw.len;
9adbd45d 3519
3a6820f2 3520 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
bcda7baa 3521 if (req->flags & REQ_F_BUFFER_SELECT) {
51aac424 3522 buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
867a23ea 3523 if (IS_ERR(buf))
898df244 3524 return ERR_CAST(buf);
3f9d6441 3525 req->rw.len = sqe_len;
bcda7baa
JA
3526 }
3527
5e49c973 3528 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
f3251183
PB
3529 if (ret)
3530 return ERR_PTR(ret);
3531 return NULL;
3a6820f2
JA
3532 }
3533
caa8fe6e 3534 iovec = s->fast_iov;
4d954c25 3535 if (req->flags & REQ_F_BUFFER_SELECT) {
caa8fe6e 3536 ret = io_iov_buffer_select(req, iovec, issue_flags);
f3251183
PB
3537 if (ret)
3538 return ERR_PTR(ret);
3539 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3540 return NULL;
4d954c25
JA
3541 }
3542
caa8fe6e 3543 ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
89cd35c5 3544 req->ctx->compat);
caa8fe6e
PB
3545 if (unlikely(ret < 0))
3546 return ERR_PTR(ret);
3547 return iovec;
2b188cc1
JA
3548}
3549
5e49c973
PB
3550static inline int io_import_iovec(int rw, struct io_kiocb *req,
3551 struct iovec **iovec, struct io_rw_state *s,
3552 unsigned int issue_flags)
3553{
caa8fe6e
PB
3554 *iovec = __io_import_iovec(rw, req, s, issue_flags);
3555 if (unlikely(IS_ERR(*iovec)))
3556 return PTR_ERR(*iovec);
5e49c973 3557
5e49c973 3558 iov_iter_save_state(&s->iter, &s->iter_state);
caa8fe6e 3559 return 0;
2b188cc1
JA
3560}
3561
0fef9483
JA
3562static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3563{
5b09e37e 3564 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3565}
3566
31b51510 3567/*
32960613
JA
3568 * For files that don't have ->read_iter() and ->write_iter(), handle them
3569 * by looping over ->read() or ->write() manually.
31b51510 3570 */
4017eb91 3571static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3572{
4017eb91
JA
3573 struct kiocb *kiocb = &req->rw.kiocb;
3574 struct file *file = req->file;
32960613 3575 ssize_t ret = 0;
af9c45ec 3576 loff_t *ppos;
32960613
JA
3577
3578 /*
3579 * Don't support polled IO through this interface, and we can't
3580 * support non-blocking either. For the latter, this just causes
3581 * the kiocb to be handled from an async context.
3582 */
3583 if (kiocb->ki_flags & IOCB_HIPRI)
3584 return -EOPNOTSUPP;
35645ac3
PB
3585 if ((kiocb->ki_flags & IOCB_NOWAIT) &&
3586 !(kiocb->ki_filp->f_flags & O_NONBLOCK))
32960613
JA
3587 return -EAGAIN;
3588
af9c45ec
DY
3589 ppos = io_kiocb_ppos(kiocb);
3590
32960613 3591 while (iov_iter_count(iter)) {
311ae9e1 3592 struct iovec iovec;
32960613
JA
3593 ssize_t nr;
3594
311ae9e1
PB
3595 if (!iov_iter_is_bvec(iter)) {
3596 iovec = iov_iter_iovec(iter);
3597 } else {
4017eb91
JA
3598 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3599 iovec.iov_len = req->rw.len;
311ae9e1
PB
3600 }
3601
32960613
JA
3602 if (rw == READ) {
3603 nr = file->f_op->read(file, iovec.iov_base,
af9c45ec 3604 iovec.iov_len, ppos);
32960613
JA
3605 } else {
3606 nr = file->f_op->write(file, iovec.iov_base,
af9c45ec 3607 iovec.iov_len, ppos);
32960613
JA
3608 }
3609
3610 if (nr < 0) {
3611 if (!ret)
3612 ret = nr;
3613 break;
3614 }
5e929367 3615 ret += nr;
16c8d2df
JA
3616 if (!iov_iter_is_bvec(iter)) {
3617 iov_iter_advance(iter, nr);
3618 } else {
16c8d2df 3619 req->rw.addr += nr;
5e929367
JA
3620 req->rw.len -= nr;
3621 if (!req->rw.len)
3622 break;
16c8d2df 3623 }
32960613
JA
3624 if (nr != iovec.iov_len)
3625 break;
32960613
JA
3626 }
3627
3628 return ret;
3629}
3630
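/*
 * Illustrative userspace sketch, not part of io_uring.c: a simplified
 * analogue of the loop above, repeatedly calling plain read(2) until the
 * buffer is full, EOF is hit, or an error occurs. loop_rw_iter() does the
 * equivalent per iovec segment and additionally stops on a short transfer.
 */
#include <errno.h>
#include <unistd.h>

static ssize_t read_full(int fd, void *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t nr = read(fd, (char *)buf + done, len - done);

		if (nr < 0) {
			if (errno == EINTR)
				continue;
			return done ? (ssize_t)done : -1;
		}
		if (nr == 0)	/* EOF */
			break;
		done += (size_t)nr;
	}
	return (ssize_t)done;
}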
ff6165b2
JA
3631static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3632 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3633{
e8c2bc1f 3634 struct io_async_rw *rw = req->async_data;
b64e3444 3635
538941e2 3636 memcpy(&rw->s.iter, iter, sizeof(*iter));
afb87658 3637 rw->free_iovec = iovec;
227c0c96 3638 rw->bytes_done = 0;
ff6165b2 3639 /* can only be fixed buffers, no need to do anything */
9c3a205c 3640 if (iov_iter_is_bvec(iter))
ff6165b2 3641 return;
b64e3444 3642 if (!iovec) {
ff6165b2
JA
3643 unsigned iov_off = 0;
3644
538941e2 3645 rw->s.iter.iov = rw->s.fast_iov;
ff6165b2
JA
3646 if (iter->iov != fast_iov) {
3647 iov_off = iter->iov - fast_iov;
538941e2 3648 rw->s.iter.iov += iov_off;
ff6165b2 3649 }
538941e2
PB
3650 if (rw->s.fast_iov != fast_iov)
3651 memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
45097dae 3652 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
3653 } else {
3654 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
3655 }
3656}
3657
8d4af685 3658static inline bool io_alloc_async_data(struct io_kiocb *req)
3d9932a8 3659{
e8c2bc1f
JA
3660 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3661 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
d886e185
PB
3662 if (req->async_data) {
3663 req->flags |= REQ_F_ASYNC_DATA;
3664 return false;
3665 }
3666 return true;
3d9932a8
XW
3667}
3668
ff6165b2 3669static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
c88598a9 3670 struct io_rw_state *s, bool force)
b7bb4f7d 3671{
26f0505a 3672 if (!force && !io_op_defs[req->opcode].needs_async_setup)
74566df3 3673 return 0;
d886e185 3674 if (!req_has_async_data(req)) {
cd658695
JA
3675 struct io_async_rw *iorw;
3676
6cb78689 3677 if (io_alloc_async_data(req)) {
6bf985dc 3678 kfree(iovec);
5d204bcf 3679 return -ENOMEM;
6bf985dc 3680 }
b7bb4f7d 3681
c88598a9 3682 io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
cd658695
JA
3683 iorw = req->async_data;
3684 /* we've copied and mapped the iter, ensure state is saved */
538941e2 3685 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
5d204bcf 3686 }
b7bb4f7d 3687 return 0;
f67676d1
JA
3688}
3689
73debe68 3690static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 3691{
e8c2bc1f 3692 struct io_async_rw *iorw = req->async_data;
5e49c973 3693 struct iovec *iov;
847595de 3694 int ret;
c3e330a4 3695
51aac424 3696 /* submission path, ->uring_lock should already be taken */
3b44b371 3697 ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
c3e330a4
PB
3698 if (unlikely(ret < 0))
3699 return ret;
3700
ab0b196c
PB
3701 iorw->bytes_done = 0;
3702 iorw->free_iovec = iov;
3703 if (iov)
3704 req->flags |= REQ_F_NEED_CLEANUP;
c3e330a4
PB
3705 return 0;
3706}
3707
c1dd91d1 3708/*
ffdc8dab 3709 * This is our waitqueue callback handler, registered through __folio_lock_async()
c1dd91d1
JA
3710 * when we initially tried to do the IO with the iocb and armed our waitqueue.
3711 * This gets called when the page is unlocked, and we generally expect that to
3712 * happen when the page IO is completed and the page is now uptodate. This will
3713 * queue a task_work based retry of the operation, attempting to copy the data
3714 * again. If the latter fails because the page was NOT uptodate, then we will
3715 * do a thread based blocking retry of the operation. That's the unexpected
3716 * slow path.
3717 */
bcf5a063
JA
3718static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3719 int sync, void *arg)
3720{
3721 struct wait_page_queue *wpq;
3722 struct io_kiocb *req = wait->private;
bcf5a063 3723 struct wait_page_key *key = arg;
bcf5a063
JA
3724
3725 wpq = container_of(wait, struct wait_page_queue, wait);
3726
cdc8fcb4
LT
3727 if (!wake_page_match(wpq, key))
3728 return 0;
3729
c8d317aa 3730 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063 3731 list_del_init(&wait->entry);
921b9054 3732 io_req_task_queue(req);
bcf5a063
JA
3733 return 1;
3734}
3735
c1dd91d1
JA
3736/*
3737 * This controls whether a given IO request should be armed for async page
3738 * based retry. If we return false here, the request is handed to the async
3739 * worker threads for retry. If we're doing buffered reads on a regular file,
3740 * we prepare a private wait_page_queue entry and retry the operation. This
3741 * will either succeed because the page is now uptodate and unlocked, or it
3742 * will register a callback when the page is unlocked at IO completion. Through
3743 * that callback, io_uring uses task_work to setup a retry of the operation.
3744 * That retry will attempt the buffered read again. The retry will generally
3745 * succeed, or in rare cases where it fails, we then fall back to using the
3746 * async worker threads for a blocking retry.
3747 */
227c0c96 3748static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 3749{
e8c2bc1f
JA
3750 struct io_async_rw *rw = req->async_data;
3751 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 3752 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 3753
bcf5a063
JA
3754 /* never retry for NOWAIT, we just complete with -EAGAIN */
3755 if (req->flags & REQ_F_NOWAIT)
3756 return false;
f67676d1 3757
227c0c96 3758 /* Only for buffered IO */
3b2a4439 3759 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 3760 return false;
3b2a4439 3761
bcf5a063
JA
3762 /*
3763 * just use poll if we can, and don't attempt if the fs doesn't
3764 * support callback based unlocks
3765 */
3766 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3767 return false;
f67676d1 3768
3b2a4439
JA
3769 wait->wait.func = io_async_buf_func;
3770 wait->wait.private = req;
3771 wait->wait.flags = 0;
3772 INIT_LIST_HEAD(&wait->wait.entry);
3773 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 3774 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 3775 kiocb->ki_waitq = wait;
3b2a4439 3776 return true;
bcf5a063
JA
3777}
3778
aeab9506 3779static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
bcf5a063 3780{
607b6fb8 3781 if (likely(req->file->f_op->read_iter))
bcf5a063 3782 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 3783 else if (req->file->f_op->read)
4017eb91 3784 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
3785 else
3786 return -EINVAL;
f67676d1
JA
3787}
3788
7db30437
ML
3789static bool need_read_all(struct io_kiocb *req)
3790{
3791 return req->flags & REQ_F_ISREG ||
3792 S_ISBLK(file_inode(req->file)->i_mode);
3793}
3794
584b0180
JA
3795static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
3796{
3797 struct kiocb *kiocb = &req->rw.kiocb;
3798 struct io_ring_ctx *ctx = req->ctx;
3799 struct file *file = req->file;
3800 int ret;
3801
3802 if (unlikely(!file || !(file->f_mode & mode)))
3803 return -EBADF;
3804
3805 if (!io_req_ffs_set(req))
3806 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
3807
3808 kiocb->ki_flags = iocb_flags(file);
3809 ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
3810 if (unlikely(ret))
3811 return ret;
3812
3813 /*
3814 * If the file is marked O_NONBLOCK, still allow retry for it if it
3815 * supports async. Otherwise it's impossible to use O_NONBLOCK files
3816 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
3817 */
3818 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
3819 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
3820 req->flags |= REQ_F_NOWAIT;
3821
3822 if (ctx->flags & IORING_SETUP_IOPOLL) {
3823 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
3824 return -EOPNOTSUPP;
3825
3826 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
3827 kiocb->ki_complete = io_complete_rw_iopoll;
3828 req->iopoll_completed = 0;
3829 } else {
3830 if (kiocb->ki_flags & IOCB_HIPRI)
3831 return -EINVAL;
3832 kiocb->ki_complete = io_complete_rw;
3833 }
3834
3835 return 0;
3836}
3837
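/*
 * Illustrative userspace sketch, not part of io_uring.c: what the IOPOLL
 * checks above mean for an application - a ring created with
 * IORING_SETUP_IOPOLL only accepts O_DIRECT style I/O on files whose
 * ->iopoll is implemented. Assumes liburing; error cleanup is trimmed.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <liburing.h>

static int iopoll_read_example(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf;
	int fd, ret;

	fd = open(path, O_RDONLY | O_DIRECT);
	if (fd < 0)
		return -1;
	if (posix_memalign(&buf, 4096, 4096))
		return -1;
	if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL))
		return -1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);

	/* with IOPOLL, waiting actively polls the device for the completion */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret)
		io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return ret;
}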
889fca73 3838static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 3839{
607b6fb8 3840 struct io_rw_state __s, *s = &__s;
c88598a9 3841 struct iovec *iovec;
9adbd45d 3842 struct kiocb *kiocb = &req->rw.kiocb;
45d189c6 3843 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
d886e185 3844 struct io_async_rw *rw;
cd658695 3845 ssize_t ret, ret2;
b4aec400 3846 loff_t *ppos;
ff6165b2 3847
607b6fb8
PB
3848 if (!req_has_async_data(req)) {
3849 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3850 if (unlikely(ret < 0))
3851 return ret;
3852 } else {
2be2eb02
JA
3853 /*
3854 * Safe and required to re-import if we're using provided
3855 * buffers, as we dropped the selected one before retry.
3856 */
3857 if (req->flags & REQ_F_BUFFER_SELECT) {
3858 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3859 if (unlikely(ret < 0))
3860 return ret;
3861 }
3862
d886e185 3863 rw = req->async_data;
c88598a9 3864 s = &rw->s;
cd658695
JA
3865 /*
3866 * We come here from an earlier attempt, restore our state to
3867 * match in case it doesn't. It's cheap enough that we don't
3868 * need to make this conditional.
3869 */
c88598a9 3870 iov_iter_restore(&s->iter, &s->iter_state);
2846c481 3871 iovec = NULL;
2846c481 3872 }
584b0180 3873 ret = io_rw_init_file(req, FMODE_READ);
323b190b
JA
3874 if (unlikely(ret)) {
3875 kfree(iovec);
584b0180 3876 return ret;
323b190b 3877 }
cef216fc 3878 req->cqe.res = iov_iter_count(&s->iter);
2b188cc1 3879
607b6fb8
PB
3880 if (force_nonblock) {
3881 /* If the file doesn't support async, just async punt */
35645ac3 3882 if (unlikely(!io_file_supports_nowait(req))) {
607b6fb8
PB
3883 ret = io_setup_async_rw(req, iovec, s, true);
3884 return ret ?: -EAGAIN;
3885 }
a88fc400 3886 kiocb->ki_flags |= IOCB_NOWAIT;
607b6fb8
PB
3887 } else {
3888 /* Ensure we clear previously set non-block flag */
3889 kiocb->ki_flags &= ~IOCB_NOWAIT;
6713e7a6 3890 }
9e645e11 3891
b4aec400 3892 ppos = io_kiocb_update_pos(req);
d34e1e5b 3893
cef216fc 3894 ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
5ea5dd45
PB
3895 if (unlikely(ret)) {
3896 kfree(iovec);
3897 return ret;
3898 }
2b188cc1 3899
c88598a9 3900 ret = io_iter_do_read(req, &s->iter);
32960613 3901
230d50d4 3902 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
6ad7f233 3903 req->flags &= ~REQ_F_REISSUE;
9af177ee
JA
3904 /* if we can poll, just do that */
3905 if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
3906 return -EAGAIN;
eefdf30f
JA
3907 /* IOPOLL retry should happen for io-wq threads */
3908 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 3909 goto done;
75c668cd
PB
3910 /* no retry on NONBLOCK nor RWF_NOWAIT */
3911 if (req->flags & REQ_F_NOWAIT)
355afaeb 3912 goto done;
f38c7e3a 3913 ret = 0;
230d50d4
JA
3914 } else if (ret == -EIOCBQUEUED) {
3915 goto out_free;
cef216fc 3916 } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
7db30437 3917 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
7335e3bf 3918 /* read all, failed, already did sync or don't want to retry */
00d23d51 3919 goto done;
227c0c96
JA
3920 }
3921
cd658695
JA
3922 /*
3923 * Don't depend on the iter state matching what was consumed, or being
3924 * untouched in case of error. Restore it and we'll advance it
3925 * manually if we need to.
3926 */
c88598a9 3927 iov_iter_restore(&s->iter, &s->iter_state);
cd658695 3928
c88598a9 3929 ret2 = io_setup_async_rw(req, iovec, s, true);
6bf985dc
PB
3930 if (ret2)
3931 return ret2;
3932
fe1cdd55 3933 iovec = NULL;
e8c2bc1f 3934 rw = req->async_data;
c88598a9 3935 s = &rw->s;
cd658695
JA
3936 /*
3937 * Now use our persistent iterator and state, if we aren't already.
3938 * We've restored and mapped the iter to match.
3939 */
227c0c96 3940
b23df91b 3941 do {
cd658695
JA
3942 /*
3943 * We end up here because of a partial read, either from
3944 * above or inside this loop. Advance the iter by the bytes
3945 * that were consumed.
3946 */
c88598a9
PB
3947 iov_iter_advance(&s->iter, ret);
3948 if (!iov_iter_count(&s->iter))
cd658695 3949 break;
b23df91b 3950 rw->bytes_done += ret;
c88598a9 3951 iov_iter_save_state(&s->iter, &s->iter_state);
cd658695 3952
b23df91b
PB
3953 /* if we can retry, do so with the callbacks armed */
3954 if (!io_rw_should_retry(req)) {
3955 kiocb->ki_flags &= ~IOCB_WAITQ;
3956 return -EAGAIN;
3957 }
3958
3959 /*
3960 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3961 * we get -EIOCBQUEUED, then we'll get a notification when the
3962 * desired page gets unlocked. We can also get a partial read
3963 * here, and if we do, then just retry at the new offset.
3964 */
c88598a9 3965 ret = io_iter_do_read(req, &s->iter);
b23df91b
PB
3966 if (ret == -EIOCBQUEUED)
3967 return 0;
227c0c96 3968 /* we got some bytes, but not all. retry. */
b5b0ecb7 3969 kiocb->ki_flags &= ~IOCB_WAITQ;
c88598a9 3970 iov_iter_restore(&s->iter, &s->iter_state);
cd658695 3971 } while (ret > 0);
227c0c96 3972done:
2ea537ca 3973 kiocb_done(req, ret, issue_flags);
fe1cdd55
PB
3974out_free:
3975 /* it's faster to check here than to delegate to kfree */
3976 if (iovec)
3977 kfree(iovec);
5ea5dd45 3978 return 0;
2b188cc1
JA
3979}
3980
889fca73 3981static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 3982{
607b6fb8 3983 struct io_rw_state __s, *s = &__s;
c88598a9 3984 struct iovec *iovec;
9adbd45d 3985 struct kiocb *kiocb = &req->rw.kiocb;
45d189c6 3986 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 3987 ssize_t ret, ret2;
b4aec400 3988 loff_t *ppos;
2b188cc1 3989
607b6fb8 3990 if (!req_has_async_data(req)) {
5e49c973
PB
3991 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
3992 if (unlikely(ret < 0))
2846c481 3993 return ret;
607b6fb8
PB
3994 } else {
3995 struct io_async_rw *rw = req->async_data;
3996
3997 s = &rw->s;
3998 iov_iter_restore(&s->iter, &s->iter_state);
2846c481 3999 iovec = NULL;
2846c481 4000 }
584b0180 4001 ret = io_rw_init_file(req, FMODE_WRITE);
323b190b
JA
4002 if (unlikely(ret)) {
4003 kfree(iovec);
584b0180 4004 return ret;
323b190b 4005 }
cef216fc 4006 req->cqe.res = iov_iter_count(&s->iter);
2b188cc1 4007
607b6fb8
PB
4008 if (force_nonblock) {
4009 /* If the file doesn't support async, just async punt */
35645ac3 4010 if (unlikely(!io_file_supports_nowait(req)))
607b6fb8 4011 goto copy_iov;
fd6c2e4c 4012
607b6fb8
PB
4013 /* file path doesn't support NOWAIT for non-direct_IO */
4014 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
4015 (req->flags & REQ_F_ISREG))
4016 goto copy_iov;
31b51510 4017
607b6fb8
PB
4018 kiocb->ki_flags |= IOCB_NOWAIT;
4019 } else {
4020 /* Ensure we clear previously set non-block flag */
4021 kiocb->ki_flags &= ~IOCB_NOWAIT;
4022 }
31b51510 4023
b4aec400 4024 ppos = io_kiocb_update_pos(req);
d34e1e5b 4025
cef216fc 4026 ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
fa15bafb
PB
4027 if (unlikely(ret))
4028 goto out_free;
4ed734b0 4029
fa15bafb
PB
4030 /*
4031 * Open-code file_start_write here to grab freeze protection,
4032 * which will be released by another thread in
4033 * io_complete_rw(). Fool lockdep by telling it the lock got
4034 * released so that it doesn't complain about the held lock when
4035 * we return to userspace.
4036 */
4037 if (req->flags & REQ_F_ISREG) {
8a3c84b6 4038 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
4039 __sb_writers_release(file_inode(req->file)->i_sb,
4040 SB_FREEZE_WRITE);
4041 }
4042 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 4043
35645ac3 4044 if (likely(req->file->f_op->write_iter))
c88598a9 4045 ret2 = call_write_iter(req->file, kiocb, &s->iter);
2dd2111d 4046 else if (req->file->f_op->write)
c88598a9 4047 ret2 = loop_rw_iter(WRITE, req, &s->iter);
2dd2111d
GH
4048 else
4049 ret2 = -EINVAL;
4ed734b0 4050
6ad7f233
PB
4051 if (req->flags & REQ_F_REISSUE) {
4052 req->flags &= ~REQ_F_REISSUE;
230d50d4 4053 ret2 = -EAGAIN;
6ad7f233 4054 }
230d50d4 4055
fa15bafb
PB
4056 /*
4057 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
4058 * retry them without IOCB_NOWAIT.
4059 */
4060 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
4061 ret2 = -EAGAIN;
75c668cd
PB
4062 /* no retry on NONBLOCK nor RWF_NOWAIT */
4063 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 4064 goto done;
fa15bafb 4065 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f 4066 /* IOPOLL retry should happen for io-wq threads */
b10841c9 4067 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
eefdf30f 4068 goto copy_iov;
355afaeb 4069done:
2ea537ca 4070 kiocb_done(req, ret2, issue_flags);
fa15bafb 4071 } else {
f67676d1 4072copy_iov:
c88598a9
PB
4073 iov_iter_restore(&s->iter, &s->iter_state);
4074 ret = io_setup_async_rw(req, iovec, s, false);
6bf985dc 4075 return ret ?: -EAGAIN;
2b188cc1 4076 }
31b51510 4077out_free:
f261c168 4078 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 4079 if (iovec)
6f2cc166 4080 kfree(iovec);
2b188cc1
JA
4081 return ret;
4082}
4083
80a261fd
JA
4084static int io_renameat_prep(struct io_kiocb *req,
4085 const struct io_uring_sqe *sqe)
4086{
4087 struct io_rename *ren = &req->rename;
4088 const char __user *oldf, *newf;
4089
ed7eb259
JA
4090 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4091 return -EINVAL;
26578cda 4092 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
ed7eb259 4093 return -EINVAL;
80a261fd
JA
4094 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4095 return -EBADF;
4096
4097 ren->old_dfd = READ_ONCE(sqe->fd);
4098 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4099 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4100 ren->new_dfd = READ_ONCE(sqe->len);
4101 ren->flags = READ_ONCE(sqe->rename_flags);
4102
4103 ren->oldpath = getname(oldf);
4104 if (IS_ERR(ren->oldpath))
4105 return PTR_ERR(ren->oldpath);
4106
4107 ren->newpath = getname(newf);
4108 if (IS_ERR(ren->newpath)) {
4109 putname(ren->oldpath);
4110 return PTR_ERR(ren->newpath);
4111 }
4112
4113 req->flags |= REQ_F_NEED_CLEANUP;
4114 return 0;
4115}
4116
45d189c6 4117static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
4118{
4119 struct io_rename *ren = &req->rename;
4120 int ret;
4121
45d189c6 4122 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
4123 return -EAGAIN;
4124
4125 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
4126 ren->newpath, ren->flags);
4127
4128 req->flags &= ~REQ_F_NEED_CLEANUP;
4129 if (ret < 0)
93d2bcd2 4130 req_set_fail(req);
80a261fd
JA
4131 io_req_complete(req, ret);
4132 return 0;
4133}
4134
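/*
 * Illustrative userspace sketch, not part of io_uring.c: filling an SQE the
 * way io_renameat_prep() above decodes it. Assumes liburing only for the sqe
 * type; recent liburing also provides io_uring_prep_renameat() for this.
 * Unused fields must stay zero, matching the prep-time checks.
 */
#include <liburing.h>
#include <string.h>

static void prep_renameat_raw(struct io_uring_sqe *sqe,
			      int old_dfd, const char *oldpath,
			      int new_dfd, const char *newpath,
			      unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RENAMEAT;
	sqe->fd = old_dfd;			/* ->old_dfd */
	sqe->addr = (unsigned long)oldpath;	/* ->oldpath */
	sqe->addr2 = (unsigned long)newpath;	/* ->newpath */
	sqe->len = new_dfd;			/* ->new_dfd */
	sqe->rename_flags = flags;		/* ->flags (RENAME_*) */
}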
14a1143b
JA
4135static int io_unlinkat_prep(struct io_kiocb *req,
4136 const struct io_uring_sqe *sqe)
4137{
4138 struct io_unlink *un = &req->unlink;
4139 const char __user *fname;
4140
22634bc5
JA
4141 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4142 return -EINVAL;
26578cda
PB
4143 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4144 sqe->splice_fd_in)
22634bc5 4145 return -EINVAL;
14a1143b
JA
4146 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4147 return -EBADF;
4148
4149 un->dfd = READ_ONCE(sqe->fd);
4150
4151 un->flags = READ_ONCE(sqe->unlink_flags);
4152 if (un->flags & ~AT_REMOVEDIR)
4153 return -EINVAL;
4154
4155 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4156 un->filename = getname(fname);
4157 if (IS_ERR(un->filename))
4158 return PTR_ERR(un->filename);
4159
4160 req->flags |= REQ_F_NEED_CLEANUP;
4161 return 0;
4162}
4163
45d189c6 4164static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
4165{
4166 struct io_unlink *un = &req->unlink;
4167 int ret;
4168
45d189c6 4169 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
4170 return -EAGAIN;
4171
4172 if (un->flags & AT_REMOVEDIR)
4173 ret = do_rmdir(un->dfd, un->filename);
4174 else
4175 ret = do_unlinkat(un->dfd, un->filename);
4176
4177 req->flags &= ~REQ_F_NEED_CLEANUP;
4178 if (ret < 0)
93d2bcd2 4179 req_set_fail(req);
14a1143b
JA
4180 io_req_complete(req, ret);
4181 return 0;
4182}
4183
e34a02dc
DK
4184static int io_mkdirat_prep(struct io_kiocb *req,
4185 const struct io_uring_sqe *sqe)
4186{
4187 struct io_mkdir *mkd = &req->mkdir;
4188 const char __user *fname;
4189
4190 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4191 return -EINVAL;
4192 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
4193 sqe->splice_fd_in)
4194 return -EINVAL;
4195 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4196 return -EBADF;
4197
4198 mkd->dfd = READ_ONCE(sqe->fd);
4199 mkd->mode = READ_ONCE(sqe->len);
4200
4201 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4202 mkd->filename = getname(fname);
4203 if (IS_ERR(mkd->filename))
4204 return PTR_ERR(mkd->filename);
4205
4206 req->flags |= REQ_F_NEED_CLEANUP;
4207 return 0;
4208}
4209
04f34081 4210static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
e34a02dc
DK
4211{
4212 struct io_mkdir *mkd = &req->mkdir;
4213 int ret;
4214
4215 if (issue_flags & IO_URING_F_NONBLOCK)
4216 return -EAGAIN;
4217
4218 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
4219
4220 req->flags &= ~REQ_F_NEED_CLEANUP;
4221 if (ret < 0)
4222 req_set_fail(req);
4223 io_req_complete(req, ret);
4224 return 0;
4225}
4226
7a8721f8
DK
4227static int io_symlinkat_prep(struct io_kiocb *req,
4228 const struct io_uring_sqe *sqe)
4229{
4230 struct io_symlink *sl = &req->symlink;
4231 const char __user *oldpath, *newpath;
4232
4233 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4234 return -EINVAL;
4235 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
4236 sqe->splice_fd_in)
4237 return -EINVAL;
4238 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4239 return -EBADF;
4240
4241 sl->new_dfd = READ_ONCE(sqe->fd);
4242 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
4243 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4244
4245 sl->oldpath = getname(oldpath);
4246 if (IS_ERR(sl->oldpath))
4247 return PTR_ERR(sl->oldpath);
4248
4249 sl->newpath = getname(newpath);
4250 if (IS_ERR(sl->newpath)) {
4251 putname(sl->oldpath);
4252 return PTR_ERR(sl->newpath);
4253 }
4254
4255 req->flags |= REQ_F_NEED_CLEANUP;
4256 return 0;
4257}
4258
04f34081 4259static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
7a8721f8
DK
4260{
4261 struct io_symlink *sl = &req->symlink;
4262 int ret;
4263
4264 if (issue_flags & IO_URING_F_NONBLOCK)
4265 return -EAGAIN;
4266
4267 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
4268
4269 req->flags &= ~REQ_F_NEED_CLEANUP;
4270 if (ret < 0)
4271 req_set_fail(req);
4272 io_req_complete(req, ret);
4273 return 0;
4274}
4275
cf30da90
DK
4276static int io_linkat_prep(struct io_kiocb *req,
4277 const struct io_uring_sqe *sqe)
4278{
4279 struct io_hardlink *lnk = &req->hardlink;
4280 const char __user *oldf, *newf;
4281
4282 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4283 return -EINVAL;
4284 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
4285 return -EINVAL;
4286 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4287 return -EBADF;
4288
4289 lnk->old_dfd = READ_ONCE(sqe->fd);
4290 lnk->new_dfd = READ_ONCE(sqe->len);
4291 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4292 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4293 lnk->flags = READ_ONCE(sqe->hardlink_flags);
4294
4295 lnk->oldpath = getname(oldf);
4296 if (IS_ERR(lnk->oldpath))
4297 return PTR_ERR(lnk->oldpath);
4298
4299 lnk->newpath = getname(newf);
4300 if (IS_ERR(lnk->newpath)) {
4301 putname(lnk->oldpath);
4302 return PTR_ERR(lnk->newpath);
4303 }
4304
4305 req->flags |= REQ_F_NEED_CLEANUP;
4306 return 0;
4307}
4308
04f34081 4309static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
cf30da90
DK
4310{
4311 struct io_hardlink *lnk = &req->hardlink;
4312 int ret;
4313
4314 if (issue_flags & IO_URING_F_NONBLOCK)
4315 return -EAGAIN;
4316
4317 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
4318 lnk->newpath, lnk->flags);
4319
4320 req->flags &= ~REQ_F_NEED_CLEANUP;
4321 if (ret < 0)
4322 req_set_fail(req);
4323 io_req_complete(req, ret);
4324 return 0;
4325}
4326
36f4fa68
JA
4327static int io_shutdown_prep(struct io_kiocb *req,
4328 const struct io_uring_sqe *sqe)
4329{
4330#if defined(CONFIG_NET)
4331 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4332 return -EINVAL;
26578cda
PB
4333 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
4334 sqe->buf_index || sqe->splice_fd_in))
36f4fa68
JA
4335 return -EINVAL;
4336
4337 req->shutdown.how = READ_ONCE(sqe->len);
4338 return 0;
4339#else
4340 return -EOPNOTSUPP;
4341#endif
4342}
4343
45d189c6 4344static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
4345{
4346#if defined(CONFIG_NET)
4347 struct socket *sock;
4348 int ret;
4349
45d189c6 4350 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
4351 return -EAGAIN;
4352
48aba79b 4353 sock = sock_from_file(req->file);
36f4fa68 4354 if (unlikely(!sock))
48aba79b 4355 return -ENOTSOCK;
36f4fa68
JA
4356
4357 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d 4358 if (ret < 0)
93d2bcd2 4359 req_set_fail(req);
36f4fa68
JA
4360 io_req_complete(req, ret);
4361 return 0;
4362#else
4363 return -EOPNOTSUPP;
4364#endif
4365}
4366
f2a8d5c7
PB
4367static int __io_splice_prep(struct io_kiocb *req,
4368 const struct io_uring_sqe *sqe)
7d67af2c 4369{
fe7e3257 4370 struct io_splice *sp = &req->splice;
7d67af2c 4371 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 4372
3232dd02
PB
4373 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4374 return -EINVAL;
7d67af2c 4375
7d67af2c
PB
4376 sp->len = READ_ONCE(sqe->len);
4377 sp->flags = READ_ONCE(sqe->splice_flags);
7d67af2c
PB
4378 if (unlikely(sp->flags & ~valid_flags))
4379 return -EINVAL;
a3e4bc23 4380 sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
7d67af2c
PB
4381 return 0;
4382}
4383
f2a8d5c7
PB
4384static int io_tee_prep(struct io_kiocb *req,
4385 const struct io_uring_sqe *sqe)
4386{
4387 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
4388 return -EINVAL;
4389 return __io_splice_prep(req, sqe);
4390}
4391
45d189c6 4392static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
4393{
4394 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
4395 struct file *out = sp->file_out;
4396 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
a3e4bc23 4397 struct file *in;
f2a8d5c7
PB
4398 long ret = 0;
4399
45d189c6 4400 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7 4401 return -EAGAIN;
a3e4bc23 4402
5106dd6e 4403 if (sp->flags & SPLICE_F_FD_IN_FIXED)
e9419766 4404 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
5106dd6e
JA
4405 else
4406 in = io_file_get_normal(req, sp->splice_fd_in);
a3e4bc23
JA
4407 if (!in) {
4408 ret = -EBADF;
4409 goto done;
4410 }
4411
f2a8d5c7
PB
4412 if (sp->len)
4413 ret = do_tee(in, out, sp->len, flags);
4414
e1d767f0
PB
4415 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4416 io_put_file(in);
a3e4bc23 4417done:
f2a8d5c7 4418 if (ret != sp->len)
93d2bcd2 4419 req_set_fail(req);
e1e16097 4420 io_req_complete(req, ret);
f2a8d5c7
PB
4421 return 0;
4422}
4423
4424static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4425{
fe7e3257 4426 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
4427
4428 sp->off_in = READ_ONCE(sqe->splice_off_in);
4429 sp->off_out = READ_ONCE(sqe->off);
4430 return __io_splice_prep(req, sqe);
4431}
4432
45d189c6 4433static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
4434{
4435 struct io_splice *sp = &req->splice;
7d67af2c
PB
4436 struct file *out = sp->file_out;
4437 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4438 loff_t *poff_in, *poff_out;
a3e4bc23 4439 struct file *in;
c9687426 4440 long ret = 0;
7d67af2c 4441
45d189c6 4442 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 4443 return -EAGAIN;
7d67af2c 4444
5106dd6e 4445 if (sp->flags & SPLICE_F_FD_IN_FIXED)
e9419766 4446 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
5106dd6e
JA
4447 else
4448 in = io_file_get_normal(req, sp->splice_fd_in);
a3e4bc23
JA
4449 if (!in) {
4450 ret = -EBADF;
4451 goto done;
4452 }
4453
7d67af2c
PB
4454 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4455 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 4456
948a7749 4457 if (sp->len)
c9687426 4458 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c 4459
e1d767f0
PB
4460 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4461 io_put_file(in);
a3e4bc23 4462done:
7d67af2c 4463 if (ret != sp->len)
93d2bcd2 4464 req_set_fail(req);
e1e16097 4465 io_req_complete(req, ret);
7d67af2c
PB
4466 return 0;
4467}
4468
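/*
 * Illustrative userspace sketch, not part of io_uring.c: SQE layout for
 * IORING_OP_SPLICE as decoded by __io_splice_prep()/io_splice_prep() above.
 * An offset of -1 means "use the file position", matching the -1 checks in
 * io_splice(). Assumes liburing only for the sqe type.
 */
#include <liburing.h>
#include <string.h>

static void prep_splice_raw(struct io_uring_sqe *sqe,
			    int fd_in, long long off_in,
			    int fd_out, long long off_out,
			    unsigned int nbytes, unsigned int splice_flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SPLICE;
	sqe->splice_fd_in = fd_in;		/* ->splice_fd_in */
	sqe->splice_off_in = off_in;		/* ->off_in */
	sqe->fd = fd_out;			/* output file, resolved as req->file */
	sqe->off = off_out;			/* ->off_out */
	sqe->len = nbytes;			/* ->len */
	sqe->splice_flags = splice_flags;	/* SPLICE_F_*, e.g. SPLICE_F_FD_IN_FIXED */
}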
2b188cc1
JA
4469/*
4470 * IORING_OP_NOP just posts a completion event, nothing else.
4471 */
889fca73 4472static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1
JA
4473{
4474 struct io_ring_ctx *ctx = req->ctx;
2b188cc1 4475
def596e9
JA
4476 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4477 return -EINVAL;
4478
889fca73 4479 __io_req_complete(req, issue_flags, 0, 0);
2b188cc1
JA
4480 return 0;
4481}
4482
4f57f06c
JA
4483static int io_msg_ring_prep(struct io_kiocb *req,
4484 const struct io_uring_sqe *sqe)
4485{
f3b6a41e
JA
4486 if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
4487 sqe->splice_fd_in || sqe->buf_index || sqe->personality))
4f57f06c
JA
4488 return -EINVAL;
4489
4f57f06c
JA
4490 req->msg.user_data = READ_ONCE(sqe->off);
4491 req->msg.len = READ_ONCE(sqe->len);
4492 return 0;
4493}
4494
4495static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
4496{
4497 struct io_ring_ctx *target_ctx;
4498 struct io_msg *msg = &req->msg;
4f57f06c 4499 bool filled;
3f1d52ab 4500 int ret;
4f57f06c 4501
3f1d52ab
JA
4502 ret = -EBADFD;
4503 if (req->file->f_op != &io_uring_fops)
4504 goto done;
4f57f06c 4505
3f1d52ab 4506 ret = -EOVERFLOW;
4f57f06c
JA
4507 target_ctx = req->file->private_data;
4508
4509 spin_lock(&target_ctx->completion_lock);
7ef66d18 4510 filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
4f57f06c
JA
4511 io_commit_cqring(target_ctx);
4512 spin_unlock(&target_ctx->completion_lock);
4513
4514 if (filled) {
4515 io_cqring_ev_posted(target_ctx);
4516 ret = 0;
4517 }
4518
3f1d52ab 4519done:
9666d420
JA
4520 if (ret < 0)
4521 req_set_fail(req);
4f57f06c
JA
4522 __io_req_complete(req, issue_flags, ret, 0);
4523 return 0;
4524}
4525
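/*
 * Illustrative userspace sketch, not part of io_uring.c: SQE layout for
 * IORING_OP_MSG_RING as decoded by io_msg_ring_prep() above. The fd must
 * refer to another io_uring instance; that ring then sees a CQE whose
 * user_data and res are taken from ->off and ->len here. Assumes liburing
 * only for the sqe type.
 */
#include <liburing.h>
#include <string.h>

static void prep_msg_ring_raw(struct io_uring_sqe *sqe, int target_ring_fd,
			      unsigned int res_to_post,
			      unsigned long long data_to_post)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MSG_RING;
	sqe->fd = target_ring_fd;	/* checked against &io_uring_fops above */
	sqe->off = data_to_post;	/* becomes cqe->user_data on the target ring */
	sqe->len = res_to_post;		/* becomes cqe->res on the target ring */
}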
1155c76a 4526static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 4527{
6b06314c 4528 struct io_ring_ctx *ctx = req->ctx;
c992fe29 4529
6b06314c 4530 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
def596e9 4531 return -EINVAL;
26578cda
PB
4532 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4533 sqe->splice_fd_in))
c992fe29
CH
4534 return -EINVAL;
4535
8ed8d3c3
JA
4536 req->sync.flags = READ_ONCE(sqe->fsync_flags);
4537 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4538 return -EINVAL;
4539
4540 req->sync.off = READ_ONCE(sqe->off);
4541 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
4542 return 0;
4543}
4544
45d189c6 4545static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 4546{
8ed8d3c3 4547 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
4548 int ret;
4549
ac45abc0 4550 /* fsync always requires a blocking context */
45d189c6 4551 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
4552 return -EAGAIN;
4553
9adbd45d 4554 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
4555 end > 0 ? end : LLONG_MAX,
4556 req->sync.flags & IORING_FSYNC_DATASYNC);
4557 if (ret < 0)
93d2bcd2 4558 req_set_fail(req);
e1e16097 4559 io_req_complete(req, ret);
c992fe29
CH
4560 return 0;
4561}
4562
d63d1b5e
JA
4563static int io_fallocate_prep(struct io_kiocb *req,
4564 const struct io_uring_sqe *sqe)
4565{
26578cda
PB
4566 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4567 sqe->splice_fd_in)
d63d1b5e 4568 return -EINVAL;
3232dd02
PB
4569 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4570 return -EINVAL;
d63d1b5e
JA
4571
4572 req->sync.off = READ_ONCE(sqe->off);
4573 req->sync.len = READ_ONCE(sqe->addr);
4574 req->sync.mode = READ_ONCE(sqe->len);
4575 return 0;
4576}
4577
45d189c6 4578static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 4579{
ac45abc0
PB
4580 int ret;
4581
d63d1b5e 4582 /* fallocate always requiring blocking context */
45d189c6 4583 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 4584 return -EAGAIN;
ac45abc0
PB
4585 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4586 req->sync.len);
ac45abc0 4587 if (ret < 0)
93d2bcd2 4588 req_set_fail(req);
f63cf519
JA
4589 else
4590 fsnotify_modify(req->file);
e1e16097 4591 io_req_complete(req, ret);
5d17b4a4
JA
4592 return 0;
4593}
4594
ec65fea5 4595static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 4596{
f8748881 4597 const char __user *fname;
15b71abe 4598 int ret;
b7bb4f7d 4599
d3fddf6d
PB
4600 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4601 return -EINVAL;
b9445598 4602 if (unlikely(sqe->ioprio || sqe->buf_index))
15b71abe 4603 return -EINVAL;
ec65fea5 4604 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 4605 return -EBADF;
03b1230c 4606
ec65fea5
PB
4607 /* open.how should be already initialised */
4608 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 4609 req->open.how.flags |= O_LARGEFILE;
3529d8c2 4610
25e72d10
PB
4611 req->open.dfd = READ_ONCE(sqe->fd);
4612 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 4613 req->open.filename = getname(fname);
15b71abe
JA
4614 if (IS_ERR(req->open.filename)) {
4615 ret = PTR_ERR(req->open.filename);
4616 req->open.filename = NULL;
4617 return ret;
4618 }
b9445598
PB
4619
4620 req->open.file_slot = READ_ONCE(sqe->file_index);
4621 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4622 return -EINVAL;
4623
4022e7af 4624 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 4625 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 4626 return 0;
03b1230c
JA
4627}
4628
ec65fea5
PB
4629static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4630{
d3fddf6d
PB
4631 u64 mode = READ_ONCE(sqe->len);
4632 u64 flags = READ_ONCE(sqe->open_flags);
ec65fea5 4633
ec65fea5
PB
4634 req->open.how = build_open_how(flags, mode);
4635 return __io_openat_prep(req, sqe);
4636}
4637
cebdb986 4638static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 4639{
cebdb986 4640 struct open_how __user *how;
cebdb986 4641 size_t len;
0fa03c62
JA
4642 int ret;
4643
cebdb986
JA
4644 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4645 len = READ_ONCE(sqe->len);
cebdb986
JA
4646 if (len < OPEN_HOW_SIZE_VER0)
4647 return -EINVAL;
3529d8c2 4648
cebdb986
JA
4649 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4650 len);
4651 if (ret)
4652 return ret;
3529d8c2 4653
ec65fea5 4654 return __io_openat_prep(req, sqe);
cebdb986
JA
4655}
4656
45d189c6 4657static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
4658{
4659 struct open_flags op;
15b71abe 4660 struct file *file;
b9445598
PB
4661 bool resolve_nonblock, nonblock_set;
4662 bool fixed = !!req->open.file_slot;
15b71abe
JA
4663 int ret;
4664
cebdb986 4665 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
4666 if (ret)
4667 goto err;
3a81fd02
JA
4668 nonblock_set = op.open_flag & O_NONBLOCK;
4669 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 4670 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
4671 /*
4672 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4673 * it'll always return -EAGAIN
4674 */
4675 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4676 return -EAGAIN;
4677 op.lookup_flags |= LOOKUP_CACHED;
4678 op.open_flag |= O_NONBLOCK;
4679 }
15b71abe 4680
b9445598
PB
4681 if (!fixed) {
4682 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4683 if (ret < 0)
4684 goto err;
4685 }
15b71abe
JA
4686
4687 file = do_filp_open(req->open.dfd, req->open.filename, &op);
12dcb58a 4688 if (IS_ERR(file)) {
944d1444 4689 /*
12dcb58a
PB
4690 * We could hang on to this 'fd' on retrying, but seems like
4691 * marginal gain for something that is now known to be a slower
4692 * path. So just put it, and we'll get a new one when we retry.
944d1444 4693 */
b9445598
PB
4694 if (!fixed)
4695 put_unused_fd(ret);
3a81fd02 4696
15b71abe 4697 ret = PTR_ERR(file);
12dcb58a
PB
4698 /* only retry if RESOLVE_CACHED wasn't already set by application */
4699 if (ret == -EAGAIN &&
4700 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4701 return -EAGAIN;
4702 goto err;
15b71abe 4703 }
12dcb58a
PB
4704
4705 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4706 file->f_flags &= ~O_NONBLOCK;
4707 fsnotify_open(file);
b9445598
PB
4708
4709 if (!fixed)
4710 fd_install(ret, file);
4711 else
4712 ret = io_install_fixed_file(req, file, issue_flags,
4713 req->open.file_slot - 1);
15b71abe
JA
4714err:
4715 putname(req->open.filename);
8fef80bf 4716 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe 4717 if (ret < 0)
93d2bcd2 4718 req_set_fail(req);
0bdf3398 4719 __io_req_complete(req, issue_flags, ret, 0);
15b71abe
JA
4720 return 0;
4721}
4722
45d189c6 4723static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 4724{
e45cff58 4725 return io_openat2(req, issue_flags);
cebdb986
JA
4726}
4727
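/*
 * Illustrative userspace sketch, not part of io_uring.c: SQE layout for
 * IORING_OP_OPENAT2 as decoded by io_openat2_prep()/__io_openat_prep()
 * above. Assumes liburing for the sqe type and <linux/openat2.h> for
 * struct open_how; liburing also provides io_uring_prep_openat2().
 */
#include <liburing.h>
#include <linux/openat2.h>
#include <string.h>

static void prep_openat2_raw(struct io_uring_sqe *sqe, int dfd,
			     const char *path, struct open_how *how)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_OPENAT2;
	sqe->fd = dfd;				/* ->open.dfd */
	sqe->addr = (unsigned long)path;	/* ->open.filename */
	sqe->addr2 = (unsigned long)how;	/* struct open_how copied in at prep time */
	sqe->len = sizeof(*how);		/* must be >= OPEN_HOW_SIZE_VER0 */
	/* a non-zero sqe->file_index installs the result into a fixed-file slot */
}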
067524e9
JA
4728static int io_remove_buffers_prep(struct io_kiocb *req,
4729 const struct io_uring_sqe *sqe)
4730{
4731 struct io_provide_buf *p = &req->pbuf;
4732 u64 tmp;
4733
26578cda
PB
4734 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4735 sqe->splice_fd_in)
067524e9
JA
4736 return -EINVAL;
4737
4738 tmp = READ_ONCE(sqe->fd);
4739 if (!tmp || tmp > USHRT_MAX)
4740 return -EINVAL;
4741
4742 memset(p, 0, sizeof(*p));
4743 p->nbufs = tmp;
4744 p->bgid = READ_ONCE(sqe->buf_group);
4745 return 0;
4746}
4747
dbc7d452
JA
4748static int __io_remove_buffers(struct io_ring_ctx *ctx,
4749 struct io_buffer_list *bl, unsigned nbufs)
067524e9
JA
4750{
4751 unsigned i = 0;
4752
4753 /* shouldn't happen */
4754 if (!nbufs)
4755 return 0;
4756
4757 /* the head kbuf is the list itself */
dbc7d452 4758 while (!list_empty(&bl->buf_list)) {
067524e9
JA
4759 struct io_buffer *nxt;
4760
dbc7d452 4761 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
067524e9 4762 list_del(&nxt->list);
067524e9
JA
4763 if (++i == nbufs)
4764 return i;
1d0254e6 4765 cond_resched();
067524e9
JA
4766 }
4767 i++;
067524e9
JA
4768
4769 return i;
4770}
4771
889fca73 4772static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
4773{
4774 struct io_provide_buf *p = &req->pbuf;
4775 struct io_ring_ctx *ctx = req->ctx;
dbc7d452 4776 struct io_buffer_list *bl;
067524e9
JA
4777 int ret = 0;
4778
f8929630 4779 io_ring_submit_lock(ctx, issue_flags);
067524e9
JA
4780
4781 ret = -ENOENT;
dbc7d452
JA
4782 bl = io_buffer_get_list(ctx, p->bgid);
4783 if (bl)
4784 ret = __io_remove_buffers(ctx, bl, p->nbufs);
067524e9 4785 if (ret < 0)
93d2bcd2 4786 req_set_fail(req);
067524e9 4787
9fb8cb49
PB
4788 /* complete before unlock, IOPOLL may need the lock */
4789 __io_req_complete(req, issue_flags, ret, 0);
f8929630 4790 io_ring_submit_unlock(ctx, issue_flags);
067524e9
JA
4791 return 0;
4792}
4793
ddf0322d
JA
4794static int io_provide_buffers_prep(struct io_kiocb *req,
4795 const struct io_uring_sqe *sqe)
4796{
38134ada 4797 unsigned long size, tmp_check;
ddf0322d
JA
4798 struct io_provide_buf *p = &req->pbuf;
4799 u64 tmp;
4800
26578cda 4801 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
ddf0322d
JA
4802 return -EINVAL;
4803
4804 tmp = READ_ONCE(sqe->fd);
4805 if (!tmp || tmp > USHRT_MAX)
4806 return -E2BIG;
4807 p->nbufs = tmp;
4808 p->addr = READ_ONCE(sqe->addr);
4809 p->len = READ_ONCE(sqe->len);
4810
38134ada
PB
4811 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4812 &size))
4813 return -EOVERFLOW;
4814 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4815 return -EOVERFLOW;
4816
d81269fe
PB
4817 size = (unsigned long)p->len * p->nbufs;
4818 if (!access_ok(u64_to_user_ptr(p->addr), size))
ddf0322d
JA
4819 return -EFAULT;
4820
4821 p->bgid = READ_ONCE(sqe->buf_group);
4822 tmp = READ_ONCE(sqe->off);
4823 if (tmp > USHRT_MAX)
4824 return -E2BIG;
4825 p->bid = tmp;
4826 return 0;
4827}
4828
cc3cec83
JA
4829static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
4830{
4831 struct io_buffer *buf;
4832 struct page *page;
4833 int bufs_in_page;
4834
4835 /*
4836 * Completions that don't happen inline (eg not under uring_lock) will
4837 * add to ->io_buffers_comp. If we don't have any free buffers, check
4838 * the completion list and splice those entries first.
4839 */
4840 if (!list_empty_careful(&ctx->io_buffers_comp)) {
4841 spin_lock(&ctx->completion_lock);
4842 if (!list_empty(&ctx->io_buffers_comp)) {
4843 list_splice_init(&ctx->io_buffers_comp,
4844 &ctx->io_buffers_cache);
4845 spin_unlock(&ctx->completion_lock);
4846 return 0;
4847 }
4848 spin_unlock(&ctx->completion_lock);
4849 }
4850
4851 /*
4852 * No free buffers and no completion entries either. Allocate a new
4853 * page worth of buffer entries and add those to our freelist.
4854 */
4855 page = alloc_page(GFP_KERNEL_ACCOUNT);
4856 if (!page)
4857 return -ENOMEM;
4858
4859 list_add(&page->lru, &ctx->io_buffers_pages);
4860
4861 buf = page_address(page);
4862 bufs_in_page = PAGE_SIZE / sizeof(*buf);
4863 while (bufs_in_page) {
4864 list_add_tail(&buf->list, &ctx->io_buffers_cache);
4865 buf++;
4866 bufs_in_page--;
4867 }
4868
4869 return 0;
4870}
4871
4872static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
dbc7d452 4873 struct io_buffer_list *bl)
ddf0322d
JA
4874{
4875 struct io_buffer *buf;
4876 u64 addr = pbuf->addr;
4877 int i, bid = pbuf->bid;
4878
4879 for (i = 0; i < pbuf->nbufs; i++) {
cc3cec83
JA
4880 if (list_empty(&ctx->io_buffers_cache) &&
4881 io_refill_buffer_cache(ctx))
ddf0322d 4882 break;
cc3cec83
JA
4883 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
4884 list);
dbc7d452 4885 list_move_tail(&buf->list, &bl->buf_list);
ddf0322d 4886 buf->addr = addr;
d1f82808 4887 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
ddf0322d 4888 buf->bid = bid;
b1c62645 4889 buf->bgid = pbuf->bgid;
ddf0322d
JA
4890 addr += pbuf->len;
4891 bid++;
f240762f 4892 cond_resched();
ddf0322d
JA
4893 }
4894
dbc7d452 4895 return i ? 0 : -ENOMEM;
ddf0322d
JA
4896}
4897
889fca73 4898static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
4899{
4900 struct io_provide_buf *p = &req->pbuf;
4901 struct io_ring_ctx *ctx = req->ctx;
dbc7d452 4902 struct io_buffer_list *bl;
ddf0322d 4903 int ret = 0;
ddf0322d 4904
f8929630 4905 io_ring_submit_lock(ctx, issue_flags);
ddf0322d 4906
dbc7d452
JA
4907 bl = io_buffer_get_list(ctx, p->bgid);
4908 if (unlikely(!bl)) {
4909 bl = kmalloc(sizeof(*bl), GFP_KERNEL);
4910 if (!bl) {
4911 ret = -ENOMEM;
4912 goto err;
4913 }
4914 io_buffer_add_list(ctx, bl, p->bgid);
ddf0322d 4915 }
dbc7d452
JA
4916
4917 ret = io_add_buffers(ctx, p, bl);
4918err:
ddf0322d 4919 if (ret < 0)
93d2bcd2 4920 req_set_fail(req);
9fb8cb49
PB
4921 /* complete before unlock, IOPOLL may need the lock */
4922 __io_req_complete(req, issue_flags, ret, 0);
f8929630 4923 io_ring_submit_unlock(ctx, issue_flags);
ddf0322d 4924 return 0;
cebdb986
JA
4925}
4926
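/*
 * Illustrative userspace sketch, not part of io_uring.c: SQE layout for
 * IORING_OP_PROVIDE_BUFFERS as decoded by io_provide_buffers_prep() above.
 * It hands the kernel nbufs buffers of buf_len bytes each, carved out of one
 * contiguous allocation, under group bgid starting at buffer id bid. Assumes
 * liburing only for the sqe type; io_uring_prep_provide_buffers() does the
 * same thing.
 */
#include <liburing.h>
#include <string.h>

static void prep_provide_buffers_raw(struct io_uring_sqe *sqe, void *base,
				     unsigned int buf_len, unsigned int nbufs,
				     unsigned int bgid, unsigned int bid)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PROVIDE_BUFFERS;
	sqe->fd = nbufs;			/* ->nbufs, must be 1..USHRT_MAX */
	sqe->addr = (unsigned long)base;	/* ->addr, start of the region */
	sqe->len = buf_len;			/* ->len, size of each buffer */
	sqe->buf_group = bgid;			/* ->bgid */
	sqe->off = bid;				/* ->bid of the first buffer */
}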
3e4827b0
JA
4927static int io_epoll_ctl_prep(struct io_kiocb *req,
4928 const struct io_uring_sqe *sqe)
4929{
4930#if defined(CONFIG_EPOLL)
26578cda 4931 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3e4827b0 4932 return -EINVAL;
2d74d042 4933 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 4934 return -EINVAL;
3e4827b0
JA
4935
4936 req->epoll.epfd = READ_ONCE(sqe->fd);
4937 req->epoll.op = READ_ONCE(sqe->len);
4938 req->epoll.fd = READ_ONCE(sqe->off);
4939
4940 if (ep_op_has_event(req->epoll.op)) {
4941 struct epoll_event __user *ev;
4942
4943 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4944 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4945 return -EFAULT;
4946 }
4947
4948 return 0;
4949#else
4950 return -EOPNOTSUPP;
4951#endif
4952}
4953
889fca73 4954static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
4955{
4956#if defined(CONFIG_EPOLL)
4957 struct io_epoll *ie = &req->epoll;
4958 int ret;
45d189c6 4959 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
4960
4961 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4962 if (force_nonblock && ret == -EAGAIN)
4963 return -EAGAIN;
4964
4965 if (ret < 0)
93d2bcd2 4966 req_set_fail(req);
889fca73 4967 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
4968 return 0;
4969#else
4970 return -EOPNOTSUPP;
4971#endif
4972}
4973
c1ca757b
JA
4974static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4975{
4976#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
26578cda 4977 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
c1ca757b 4978 return -EINVAL;
3232dd02
PB
4979 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4980 return -EINVAL;
c1ca757b
JA
4981
4982 req->madvise.addr = READ_ONCE(sqe->addr);
4983 req->madvise.len = READ_ONCE(sqe->len);
4984 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4985 return 0;
4986#else
4987 return -EOPNOTSUPP;
4988#endif
4989}
4990
45d189c6 4991static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
4992{
4993#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4994 struct io_madvise *ma = &req->madvise;
4995 int ret;
4996
45d189c6 4997 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
4998 return -EAGAIN;
4999
0726b01e 5000 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b 5001 if (ret < 0)
93d2bcd2 5002 req_set_fail(req);
e1e16097 5003 io_req_complete(req, ret);
c1ca757b
JA
5004 return 0;
5005#else
5006 return -EOPNOTSUPP;
5007#endif
5008}
5009
4840e418
JA
5010static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5011{
26578cda 5012 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4840e418 5013 return -EINVAL;
3232dd02
PB
5014 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5015 return -EINVAL;
4840e418
JA
5016
5017 req->fadvise.offset = READ_ONCE(sqe->off);
5018 req->fadvise.len = READ_ONCE(sqe->len);
5019 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
5020 return 0;
5021}
5022
45d189c6 5023static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
5024{
5025 struct io_fadvise *fa = &req->fadvise;
5026 int ret;
5027
45d189c6 5028 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
5029 switch (fa->advice) {
5030 case POSIX_FADV_NORMAL:
5031 case POSIX_FADV_RANDOM:
5032 case POSIX_FADV_SEQUENTIAL:
5033 break;
5034 default:
5035 return -EAGAIN;
5036 }
5037 }
4840e418
JA
5038
5039 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
5040 if (ret < 0)
93d2bcd2 5041 req_set_fail(req);
0bdf3398 5042 __io_req_complete(req, issue_flags, ret, 0);
4840e418
JA
5043 return 0;
5044}
5045
eddc7ef5
JA
5046static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5047{
1b6fe6e0
SR
5048 const char __user *path;
5049
2d74d042 5050 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 5051 return -EINVAL;
26578cda 5052 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
eddc7ef5 5053 return -EINVAL;
9c280f90 5054 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 5055 return -EBADF;
eddc7ef5 5056
1d9e1288
BM
5057 req->statx.dfd = READ_ONCE(sqe->fd);
5058 req->statx.mask = READ_ONCE(sqe->len);
1b6fe6e0 5059 path = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
5060 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5061 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 5062
1b6fe6e0
SR
5063 req->statx.filename = getname_flags(path,
5064 getname_statx_lookup_flags(req->statx.flags),
5065 NULL);
5066
5067 if (IS_ERR(req->statx.filename)) {
5068 int ret = PTR_ERR(req->statx.filename);
5069
5070 req->statx.filename = NULL;
5071 return ret;
5072 }
5073
5074 req->flags |= REQ_F_NEED_CLEANUP;
eddc7ef5
JA
5075 return 0;
5076}
5077
45d189c6 5078static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 5079{
1d9e1288 5080 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
5081 int ret;
5082
59d70013 5083 if (issue_flags & IO_URING_F_NONBLOCK)
eddc7ef5
JA
5084 return -EAGAIN;
5085
e62753e4
BM
5086 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
5087 ctx->buffer);
eddc7ef5 5088
eddc7ef5 5089 if (ret < 0)
93d2bcd2 5090 req_set_fail(req);
e1e16097 5091 io_req_complete(req, ret);
eddc7ef5
JA
5092 return 0;
5093}
5094
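/*
 * Illustrative userspace sketch, not part of io_uring.c: SQE layout for
 * IORING_OP_STATX as decoded by io_statx_prep() above. Assumes liburing for
 * the sqe type and <linux/stat.h> for struct statx and the STATX_* masks.
 */
#include <liburing.h>
#include <linux/stat.h>
#include <string.h>

static void prep_statx_raw(struct io_uring_sqe *sqe, int dfd, const char *path,
			   int flags, unsigned int mask, struct statx *stx)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_STATX;
	sqe->fd = dfd;				/* ->statx.dfd */
	sqe->addr = (unsigned long)path;	/* ->statx.filename */
	sqe->addr2 = (unsigned long)stx;	/* ->statx.buffer */
	sqe->len = mask;			/* ->statx.mask */
	sqe->statx_flags = flags;		/* ->statx.flags (AT_*) */
}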
b5dba59e
JA
5095static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5096{
14587a46 5097 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3232dd02 5098 return -EINVAL;
b5dba59e 5099 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
7df778be 5100 sqe->rw_flags || sqe->buf_index)
b5dba59e 5101 return -EINVAL;
9c280f90 5102 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 5103 return -EBADF;
b5dba59e
JA
5104
5105 req->close.fd = READ_ONCE(sqe->fd);
7df778be
PB
5106 req->close.file_slot = READ_ONCE(sqe->file_index);
5107 if (req->close.file_slot && req->close.fd)
5108 return -EINVAL;
5109
b5dba59e 5110 return 0;
b5dba59e
JA
5111}
5112
889fca73 5113static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 5114{
9eac1904 5115 struct files_struct *files = current->files;
3af73b28 5116 struct io_close *close = &req->close;
9eac1904 5117 struct fdtable *fdt;
a1fde923
PB
5118 struct file *file = NULL;
5119 int ret = -EBADF;
b5dba59e 5120
7df778be
PB
5121 if (req->close.file_slot) {
5122 ret = io_close_fixed(req, issue_flags);
5123 goto err;
5124 }
5125
9eac1904
JA
5126 spin_lock(&files->file_lock);
5127 fdt = files_fdtable(files);
5128 if (close->fd >= fdt->max_fds) {
5129 spin_unlock(&files->file_lock);
5130 goto err;
5131 }
5132 file = fdt->fd[close->fd];
a1fde923 5133 if (!file || file->f_op == &io_uring_fops) {
9eac1904
JA
5134 spin_unlock(&files->file_lock);
5135 file = NULL;
5136 goto err;
3af73b28 5137 }
b5dba59e
JA
5138
5139 /* if the file has a flush method, be safe and punt to async */
45d189c6 5140 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 5141 spin_unlock(&files->file_lock);
0bf0eefd 5142 return -EAGAIN;
a2100672 5143 }
b5dba59e 5144
9eac1904
JA
5145 ret = __close_fd_get_file(close->fd, &file);
5146 spin_unlock(&files->file_lock);
5147 if (ret < 0) {
5148 if (ret == -ENOENT)
5149 ret = -EBADF;
5150 goto err;
5151 }
5152
3af73b28 5153 /* No ->flush() or already async, safely close from here */
9eac1904
JA
5154 ret = filp_close(file, current->files);
5155err:
3af73b28 5156 if (ret < 0)
93d2bcd2 5157 req_set_fail(req);
9eac1904
JA
5158 if (file)
5159 fput(file);
889fca73 5160 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 5161 return 0;
b5dba59e
JA
5162}
5163
1155c76a 5164static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4
JA
5165{
5166 struct io_ring_ctx *ctx = req->ctx;
5d17b4a4 5167
5d17b4a4
JA
5168 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
5169 return -EINVAL;
26578cda
PB
5170 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
5171 sqe->splice_fd_in))
5d17b4a4
JA
5172 return -EINVAL;
5173
8ed8d3c3
JA
5174 req->sync.off = READ_ONCE(sqe->off);
5175 req->sync.len = READ_ONCE(sqe->len);
5176 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
5177 return 0;
5178}
5179
45d189c6 5180static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 5181{
8ed8d3c3
JA
5182 int ret;
5183
ac45abc0 5184 /* sync_file_range always requires a blocking context */
45d189c6 5185 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
5186 return -EAGAIN;
5187
9adbd45d 5188 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
5189 req->sync.flags);
5190 if (ret < 0)
93d2bcd2 5191 req_set_fail(req);
e1e16097 5192 io_req_complete(req, ret);
5d17b4a4
JA
5193 return 0;
5194}
5195
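A small userspace sketch combining the close and sync_file_range ops above, assuming liburing's io_uring_prep_sync_file_range() and io_uring_prep_close() helpers (the argument order for the former is believed to be fd, len, offset, flags); the file name and sizes are placeholders and error handling is omitted.

#include <liburing.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	const char buf[] = "hello\n";
	int fd = open("out.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	io_uring_queue_init(8, &ring, 0);
	write(fd, buf, strlen(buf));		/* ordinary write(2) */

	sqe = io_uring_get_sqe(&ring);		/* flush just that range */
	io_uring_prep_sync_file_range(sqe, fd, strlen(buf), 0, 0);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("sync_file_range: %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);

	sqe = io_uring_get_sqe(&ring);		/* then close through the ring */
	io_uring_prep_close(sqe, fd);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}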
469956e8 5196#if defined(CONFIG_NET)
02d27d89
PB
5197static int io_setup_async_msg(struct io_kiocb *req,
5198 struct io_async_msghdr *kmsg)
5199{
e8c2bc1f
JA
5200 struct io_async_msghdr *async_msg = req->async_data;
5201
5202 if (async_msg)
02d27d89 5203 return -EAGAIN;
e8c2bc1f 5204 if (io_alloc_async_data(req)) {
257e84a5 5205 kfree(kmsg->free_iov);
02d27d89
PB
5206 return -ENOMEM;
5207 }
e8c2bc1f 5208 async_msg = req->async_data;
02d27d89 5209 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 5210 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 5211 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
5212	/* if we're using fast_iov, set it to the new one */
5213 if (!async_msg->free_iov)
5214 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
5215
02d27d89
PB
5216 return -EAGAIN;
5217}
5218
2ae523ed
PB
5219static int io_sendmsg_copy_hdr(struct io_kiocb *req,
5220 struct io_async_msghdr *iomsg)
5221{
2ae523ed 5222 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 5223 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 5224 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 5225 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
5226}
5227
93642ef8
PB
5228static int io_sendmsg_prep_async(struct io_kiocb *req)
5229{
5230 int ret;
5231
93642ef8
PB
5232 ret = io_sendmsg_copy_hdr(req, req->async_data);
5233 if (!ret)
5234 req->flags |= REQ_F_NEED_CLEANUP;
5235 return ret;
5236}
5237
3529d8c2 5238static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 5239{
e47293fd 5240 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 5241
d2b6f48b
PB
5242 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5243 return -EINVAL;
5244
270a5940 5245 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 5246 sr->len = READ_ONCE(sqe->len);
04411806
PB
5247 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5248 if (sr->msg_flags & MSG_DONTWAIT)
5249 req->flags |= REQ_F_NOWAIT;
3529d8c2 5250
d8768362
JA
5251#ifdef CONFIG_COMPAT
5252 if (req->ctx->compat)
5253 sr->msg_flags |= MSG_CMSG_COMPAT;
5254#endif
93642ef8 5255 return 0;
03b1230c
JA
5256}
5257
889fca73 5258static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 5259{
6b754c8b 5260 struct io_async_msghdr iomsg, *kmsg;
0fa03c62 5261 struct socket *sock;
7a7cacba 5262 unsigned flags;
0031275d 5263 int min_ret = 0;
0fa03c62
JA
5264 int ret;
5265
dba4a925 5266 sock = sock_from_file(req->file);
7a7cacba 5267 if (unlikely(!sock))
dba4a925 5268 return -ENOTSOCK;
3529d8c2 5269
d886e185
PB
5270 if (req_has_async_data(req)) {
5271 kmsg = req->async_data;
5272 } else {
7a7cacba
PB
5273 ret = io_sendmsg_copy_hdr(req, &iomsg);
5274 if (ret)
5275 return ret;
5276 kmsg = &iomsg;
0fa03c62 5277 }
0fa03c62 5278
04411806
PB
5279 flags = req->sr_msg.msg_flags;
5280 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 5281 flags |= MSG_DONTWAIT;
0031275d
SM
5282 if (flags & MSG_WAITALL)
5283 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5284
7a7cacba 5285 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
0fa03c62 5286
7297ce3d
PB
5287 if (ret < min_ret) {
5288 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5289 return io_setup_async_msg(req, kmsg);
5290 if (ret == -ERESTARTSYS)
5291 ret = -EINTR;
5292 req_set_fail(req);
5293 }
257e84a5
PB
5294 /* fast path, check for non-NULL to avoid function call */
5295 if (kmsg->free_iov)
5296 kfree(kmsg->free_iov);
99bc4c38 5297 req->flags &= ~REQ_F_NEED_CLEANUP;
889fca73 5298 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 5299 return 0;
03b1230c 5300}
aa1fa28f 5301
889fca73 5302static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 5303{
7a7cacba
PB
5304 struct io_sr_msg *sr = &req->sr_msg;
5305 struct msghdr msg;
5306 struct iovec iov;
fddaface 5307 struct socket *sock;
7a7cacba 5308 unsigned flags;
0031275d 5309 int min_ret = 0;
fddaface
JA
5310 int ret;
5311
dba4a925 5312 sock = sock_from_file(req->file);
7a7cacba 5313 if (unlikely(!sock))
dba4a925 5314 return -ENOTSOCK;
fddaface 5315
7a7cacba
PB
5316 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
5317 if (unlikely(ret))
14db8411 5318 return ret;
fddaface 5319
7a7cacba
PB
5320 msg.msg_name = NULL;
5321 msg.msg_control = NULL;
5322 msg.msg_controllen = 0;
5323 msg.msg_namelen = 0;
fddaface 5324
04411806
PB
5325 flags = req->sr_msg.msg_flags;
5326 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 5327 flags |= MSG_DONTWAIT;
0031275d
SM
5328 if (flags & MSG_WAITALL)
5329 min_ret = iov_iter_count(&msg.msg_iter);
5330
7a7cacba
PB
5331 msg.msg_flags = flags;
5332 ret = sock_sendmsg(sock, &msg);
7297ce3d
PB
5333 if (ret < min_ret) {
5334 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5335 return -EAGAIN;
5336 if (ret == -ERESTARTSYS)
5337 ret = -EINTR;
93d2bcd2 5338 req_set_fail(req);
7297ce3d 5339 }
889fca73 5340 __io_req_complete(req, issue_flags, ret, 0);
fddaface 5341 return 0;
fddaface
JA
5342}
5343
1400e697
PB
5344static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
5345 struct io_async_msghdr *iomsg)
52de1fe1
JA
5346{
5347 struct io_sr_msg *sr = &req->sr_msg;
5348 struct iovec __user *uiov;
5349 size_t iov_len;
5350 int ret;
5351
1400e697
PB
5352 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
5353 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
5354 if (ret)
5355 return ret;
5356
5357 if (req->flags & REQ_F_BUFFER_SELECT) {
5358 if (iov_len > 1)
5359 return -EINVAL;
5476dfed 5360 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 5361 return -EFAULT;
5476dfed 5362 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 5363 iomsg->free_iov = NULL;
52de1fe1 5364 } else {
257e84a5 5365 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 5366 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 5367 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 5368 false);
52de1fe1
JA
5369 if (ret > 0)
5370 ret = 0;
5371 }
5372
5373 return ret;
5374}
5375
5376#ifdef CONFIG_COMPAT
5377static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 5378 struct io_async_msghdr *iomsg)
52de1fe1 5379{
52de1fe1
JA
5380 struct io_sr_msg *sr = &req->sr_msg;
5381 struct compat_iovec __user *uiov;
5382 compat_uptr_t ptr;
5383 compat_size_t len;
5384 int ret;
5385
4af3417a
PB
5386 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
5387 &ptr, &len);
52de1fe1
JA
5388 if (ret)
5389 return ret;
5390
5391 uiov = compat_ptr(ptr);
5392 if (req->flags & REQ_F_BUFFER_SELECT) {
5393 compat_ssize_t clen;
5394
5395 if (len > 1)
5396 return -EINVAL;
5397 if (!access_ok(uiov, sizeof(*uiov)))
5398 return -EFAULT;
5399 if (__get_user(clen, &uiov->iov_len))
5400 return -EFAULT;
5401 if (clen < 0)
5402 return -EINVAL;
2d280bc8 5403 sr->len = clen;
257e84a5 5404 iomsg->free_iov = NULL;
52de1fe1 5405 } else {
257e84a5 5406 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 5407 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 5408 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 5409 &iomsg->msg.msg_iter, true);
52de1fe1
JA
5410 if (ret < 0)
5411 return ret;
5412 }
5413
5414 return 0;
5415}
5416#endif
5417
1400e697
PB
5418static int io_recvmsg_copy_hdr(struct io_kiocb *req,
5419 struct io_async_msghdr *iomsg)
52de1fe1 5420{
1400e697 5421 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
5422
5423#ifdef CONFIG_COMPAT
5424 if (req->ctx->compat)
1400e697 5425 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 5426#endif
52de1fe1 5427
1400e697 5428 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
5429}
5430
bcda7baa 5431static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
51aac424 5432 unsigned int issue_flags)
bcda7baa
JA
5433{
5434 struct io_sr_msg *sr = &req->sr_msg;
bcda7baa 5435
51aac424 5436 return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
fddaface
JA
5437}
5438
93642ef8 5439static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 5440{
99bc4c38 5441 int ret;
3529d8c2 5442
93642ef8
PB
5443 ret = io_recvmsg_copy_hdr(req, req->async_data);
5444 if (!ret)
5445 req->flags |= REQ_F_NEED_CLEANUP;
5446 return ret;
5447}
5448
5449static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5450{
5451 struct io_sr_msg *sr = &req->sr_msg;
5452
d2b6f48b
PB
5453 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5454 return -EINVAL;
5455
270a5940 5456 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 5457 sr->len = READ_ONCE(sqe->len);
bcda7baa 5458 sr->bgid = READ_ONCE(sqe->buf_group);
04411806
PB
5459 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5460 if (sr->msg_flags & MSG_DONTWAIT)
5461 req->flags |= REQ_F_NOWAIT;
06b76d44 5462
d8768362
JA
5463#ifdef CONFIG_COMPAT
5464 if (req->ctx->compat)
5465 sr->msg_flags |= MSG_CMSG_COMPAT;
5466#endif
7ba89d2a 5467 sr->done_io = 0;
93642ef8 5468 return 0;
aa1fa28f
JA
5469}
5470
7ba89d2a
JA
5471static bool io_net_retry(struct socket *sock, int flags)
5472{
5473 if (!(flags & MSG_WAITALL))
5474 return false;
5475 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
5476}
5477
889fca73 5478static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 5479{
6b754c8b 5480 struct io_async_msghdr iomsg, *kmsg;
7ba89d2a 5481 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 5482 struct socket *sock;
7fbb1b54 5483 struct io_buffer *kbuf;
7a7cacba 5484 unsigned flags;
d1fd1c20 5485 int ret, min_ret = 0;
45d189c6 5486 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 5487
dba4a925 5488 sock = sock_from_file(req->file);
7a7cacba 5489 if (unlikely(!sock))
dba4a925 5490 return -ENOTSOCK;
3529d8c2 5491
d886e185
PB
5492 if (req_has_async_data(req)) {
5493 kmsg = req->async_data;
5494 } else {
7a7cacba
PB
5495 ret = io_recvmsg_copy_hdr(req, &iomsg);
5496 if (ret)
681fda8d 5497 return ret;
7a7cacba
PB
5498 kmsg = &iomsg;
5499 }
03b1230c 5500
bc02ef33 5501 if (req->flags & REQ_F_BUFFER_SELECT) {
51aac424 5502 kbuf = io_recv_buffer_select(req, issue_flags);
bc02ef33 5503 if (IS_ERR(kbuf))
52de1fe1 5504 return PTR_ERR(kbuf);
7a7cacba 5505 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5476dfed
PB
5506 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
5507 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
7a7cacba
PB
5508 1, req->sr_msg.len);
5509 }
52de1fe1 5510
04411806
PB
5511 flags = req->sr_msg.msg_flags;
5512 if (force_nonblock)
7a7cacba 5513 flags |= MSG_DONTWAIT;
0031275d
SM
5514 if (flags & MSG_WAITALL)
5515 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5516
7a7cacba
PB
5517 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5518 kmsg->uaddr, flags);
7297ce3d
PB
5519 if (ret < min_ret) {
5520 if (ret == -EAGAIN && force_nonblock)
5521 return io_setup_async_msg(req, kmsg);
5522 if (ret == -ERESTARTSYS)
5523 ret = -EINTR;
7ba89d2a
JA
5524 if (ret > 0 && io_net_retry(sock, flags)) {
5525 sr->done_io += ret;
8a3e8ee5 5526 req->flags |= REQ_F_PARTIAL_IO;
7ba89d2a
JA
5527 return io_setup_async_msg(req, kmsg);
5528 }
7297ce3d
PB
5529 req_set_fail(req);
5530 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5531 req_set_fail(req);
5532 }
03b1230c 5533
257e84a5
PB
5534 /* fast path, check for non-NULL to avoid function call */
5535 if (kmsg->free_iov)
5536 kfree(kmsg->free_iov);
99bc4c38 5537 req->flags &= ~REQ_F_NEED_CLEANUP;
7ba89d2a
JA
5538 if (ret >= 0)
5539 ret += sr->done_io;
5540 else if (sr->done_io)
5541 ret = sr->done_io;
cc3cec83 5542 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
03b1230c 5543 return 0;
0fa03c62 5544}
5d17b4a4 5545
889fca73 5546static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 5547{
6b754c8b 5548 struct io_buffer *kbuf;
7a7cacba
PB
5549 struct io_sr_msg *sr = &req->sr_msg;
5550 struct msghdr msg;
5551 void __user *buf = sr->buf;
fddaface 5552 struct socket *sock;
7a7cacba
PB
5553 struct iovec iov;
5554 unsigned flags;
d1fd1c20 5555 int ret, min_ret = 0;
45d189c6 5556 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 5557
dba4a925 5558 sock = sock_from_file(req->file);
7a7cacba 5559 if (unlikely(!sock))
dba4a925 5560 return -ENOTSOCK;
fddaface 5561
bc02ef33 5562 if (req->flags & REQ_F_BUFFER_SELECT) {
51aac424 5563 kbuf = io_recv_buffer_select(req, issue_flags);
bcda7baa
JA
5564 if (IS_ERR(kbuf))
5565 return PTR_ERR(kbuf);
7a7cacba 5566 buf = u64_to_user_ptr(kbuf->addr);
bc02ef33 5567 }
bcda7baa 5568
7a7cacba 5569 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
5570 if (unlikely(ret))
5571 goto out_free;
fddaface 5572
7a7cacba
PB
5573 msg.msg_name = NULL;
5574 msg.msg_control = NULL;
5575 msg.msg_controllen = 0;
5576 msg.msg_namelen = 0;
5577 msg.msg_iocb = NULL;
5578 msg.msg_flags = 0;
fddaface 5579
04411806
PB
5580 flags = req->sr_msg.msg_flags;
5581 if (force_nonblock)
7a7cacba 5582 flags |= MSG_DONTWAIT;
0031275d
SM
5583 if (flags & MSG_WAITALL)
5584 min_ret = iov_iter_count(&msg.msg_iter);
5585
7a7cacba 5586 ret = sock_recvmsg(sock, &msg, flags);
7297ce3d
PB
5587 if (ret < min_ret) {
5588 if (ret == -EAGAIN && force_nonblock)
5589 return -EAGAIN;
5590 if (ret == -ERESTARTSYS)
5591 ret = -EINTR;
7ba89d2a
JA
5592 if (ret > 0 && io_net_retry(sock, flags)) {
5593 sr->len -= ret;
5594 sr->buf += ret;
5595 sr->done_io += ret;
8a3e8ee5 5596 req->flags |= REQ_F_PARTIAL_IO;
7ba89d2a
JA
5597 return -EAGAIN;
5598 }
7297ce3d
PB
5599 req_set_fail(req);
5600 } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
0d7c1153 5601out_free:
93d2bcd2 5602 req_set_fail(req);
7297ce3d 5603 }
cc3cec83 5604
7ba89d2a
JA
5605 if (ret >= 0)
5606 ret += sr->done_io;
5607 else if (sr->done_io)
5608 ret = sr->done_io;
cc3cec83 5609 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
fddaface 5610 return 0;
fddaface
JA
5611}
5612
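To show the plain send/recv ops above from the user side, a sketch over a socketpair, assuming liburing's io_uring_prep_send()/io_uring_prep_recv() helpers; the two SQEs are deliberately left unlinked, since a recv that finds no data is poll-armed internally and completes once the peer's data arrives.

#include <liburing.h>
#include <sys/socket.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char out[] = "ping", in[16];
	int sv[2], i;

	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_send(sqe, sv[0], out, strlen(out), 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_recv(sqe, sv[1], in, sizeof(in), 0);
	io_uring_submit(&ring);

	for (i = 0; i < 2; i++) {		/* one CQE per op, in any order */
		io_uring_wait_cqe(&ring, &cqe);
		printf("res %d\n", cqe->res);	/* bytes transferred or -errno */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}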
3529d8c2 5613static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 5614{
8ed8d3c3
JA
5615 struct io_accept *accept = &req->accept;
5616
14587a46 5617 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
17f2fe35 5618 return -EINVAL;
aaa4db12 5619 if (sqe->ioprio || sqe->len || sqe->buf_index)
17f2fe35
JA
5620 return -EINVAL;
5621
d55e5f5b
JA
5622 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5623 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 5624 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 5625 accept->nofile = rlimit(RLIMIT_NOFILE);
a7083ad5 5626
aaa4db12 5627 accept->file_slot = READ_ONCE(sqe->file_index);
adf3a9e9 5628 if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
aaa4db12 5629 return -EINVAL;
a7083ad5
PB
5630 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5631 return -EINVAL;
5632 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5633 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
8ed8d3c3 5634 return 0;
8ed8d3c3 5635}
17f2fe35 5636
889fca73 5637static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
5638{
5639 struct io_accept *accept = &req->accept;
45d189c6 5640 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 5641 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
aaa4db12 5642 bool fixed = !!accept->file_slot;
a7083ad5
PB
5643 struct file *file;
5644 int ret, fd;
8ed8d3c3 5645
aaa4db12
PB
5646 if (!fixed) {
5647 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5648 if (unlikely(fd < 0))
5649 return fd;
5650 }
a7083ad5
PB
5651 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5652 accept->flags);
5653 if (IS_ERR(file)) {
aaa4db12
PB
5654 if (!fixed)
5655 put_unused_fd(fd);
a7083ad5
PB
5656 ret = PTR_ERR(file);
5657 if (ret == -EAGAIN && force_nonblock)
5658 return -EAGAIN;
ac45abc0
PB
5659 if (ret == -ERESTARTSYS)
5660 ret = -EINTR;
93d2bcd2 5661 req_set_fail(req);
aaa4db12 5662 } else if (!fixed) {
a7083ad5
PB
5663 fd_install(fd, file);
5664 ret = fd;
aaa4db12
PB
5665 } else {
5666 ret = io_install_fixed_file(req, file, issue_flags,
5667 accept->file_slot - 1);
ac45abc0 5668 }
889fca73 5669 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 5670 return 0;
8ed8d3c3
JA
5671}
5672
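A hedged userspace sketch of the accept op above using liburing's io_uring_prep_accept(); the port is arbitrary, the peer address is ignored by passing NULL, and error handling is omitted. The completion's res carries the new descriptor (a fixed-slot result when sqe->file_index is used, which this sketch does not).

#include <liburing.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(8080),
				    .sin_addr.s_addr = htonl(INADDR_LOOPBACK) };
	int lfd = socket(AF_INET, SOCK_STREAM, 0);

	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 16);

	io_uring_queue_init(8, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_accept(sqe, lfd, NULL, NULL, SOCK_CLOEXEC);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);		/* res: new fd or -errno */
	printf("accepted: %d\n", cqe->res);
	if (cqe->res >= 0)
		close(cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}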
93642ef8
PB
5673static int io_connect_prep_async(struct io_kiocb *req)
5674{
5675 struct io_async_connect *io = req->async_data;
5676 struct io_connect *conn = &req->connect;
5677
5678 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5679}
5680
3529d8c2 5681static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 5682{
3529d8c2 5683 struct io_connect *conn = &req->connect;
f499a021 5684
14587a46 5685 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3fbb51c1 5686 return -EINVAL;
26578cda
PB
5687 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5688 sqe->splice_fd_in)
3fbb51c1
JA
5689 return -EINVAL;
5690
3529d8c2
JA
5691 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5692 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 5693 return 0;
f499a021
JA
5694}
5695
889fca73 5696static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 5697{
e8c2bc1f 5698 struct io_async_connect __io, *io;
f8e85cf2 5699 unsigned file_flags;
3fbb51c1 5700 int ret;
45d189c6 5701 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 5702
d886e185 5703 if (req_has_async_data(req)) {
e8c2bc1f 5704 io = req->async_data;
f499a021 5705 } else {
3529d8c2
JA
5706 ret = move_addr_to_kernel(req->connect.addr,
5707 req->connect.addr_len,
e8c2bc1f 5708 &__io.address);
f499a021
JA
5709 if (ret)
5710 goto out;
5711 io = &__io;
5712 }
5713
3fbb51c1
JA
5714 file_flags = force_nonblock ? O_NONBLOCK : 0;
5715
e8c2bc1f 5716 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 5717 req->connect.addr_len, file_flags);
87f80d62 5718 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
d886e185 5719 if (req_has_async_data(req))
b7bb4f7d 5720 return -EAGAIN;
e8c2bc1f 5721 if (io_alloc_async_data(req)) {
f499a021
JA
5722 ret = -ENOMEM;
5723 goto out;
5724 }
e8c2bc1f 5725 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 5726 return -EAGAIN;
f499a021 5727 }
f8e85cf2
JA
5728 if (ret == -ERESTARTSYS)
5729 ret = -EINTR;
f499a021 5730out:
4e88d6e7 5731 if (ret < 0)
93d2bcd2 5732 req_set_fail(req);
889fca73 5733 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 5734 return 0;
469956e8
Y
5735}
5736#else /* !CONFIG_NET */
99a10081
JA
5737#define IO_NETOP_FN(op) \
5738static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
5739{ \
5740 return -EOPNOTSUPP; \
5741}
5742
5743#define IO_NETOP_PREP(op) \
5744IO_NETOP_FN(op) \
5745static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5746{ \
5747 return -EOPNOTSUPP; \
5748} \
5749
5750#define IO_NETOP_PREP_ASYNC(op) \
5751IO_NETOP_PREP(op) \
5752static int io_##op##_prep_async(struct io_kiocb *req) \
5753{ \
5754 return -EOPNOTSUPP; \
5755}
5756
5757IO_NETOP_PREP_ASYNC(sendmsg);
5758IO_NETOP_PREP_ASYNC(recvmsg);
5759IO_NETOP_PREP_ASYNC(connect);
5760IO_NETOP_PREP(accept);
5761IO_NETOP_FN(send);
5762IO_NETOP_FN(recv);
469956e8 5763#endif /* CONFIG_NET */
f8e85cf2 5764
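For completeness, the connect op from the user side, again as a non-authoritative sketch assuming liburing's io_uring_prep_connect() helper; the address and port are placeholders and error handling is omitted.

#include <liburing.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(80),
				   .sin_addr.s_addr = htonl(INADDR_LOOPBACK) };
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	io_uring_queue_init(8, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_connect(sqe, fd, (struct sockaddr *)&dst, sizeof(dst));
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("connect: %d\n", cqe->res);	/* 0 on success, -errno otherwise */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}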
d7718a9d
JA
5765struct io_poll_table {
5766 struct poll_table_struct pt;
5767 struct io_kiocb *req;
68b11e8b 5768 int nr_entries;
d7718a9d
JA
5769 int error;
5770};
ce593a6c 5771
aa43477b 5772#define IO_POLL_CANCEL_FLAG BIT(31)
e2c0cb7c 5773#define IO_POLL_REF_MASK GENMASK(30, 0)
6d816e08 5774
aa43477b
PB
5775/*
5776 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
5777 * bump it and acquire ownership. It's disallowed to modify requests while not
5778 * owning it, which prevents races when enqueueing task_work and between
5779 * arming poll and wakeups.
5780 */
5781static inline bool io_poll_get_ownership(struct io_kiocb *req)
5782{
5783 return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
d7718a9d
JA
5784}
5785
aa43477b 5786static void io_poll_mark_cancelled(struct io_kiocb *req)
74ce6ce4 5787{
aa43477b 5788 atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
74ce6ce4
JA
5789}
5790
d4e7cd36 5791static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 5792{
e8c2bc1f 5793 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 5794 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 5795 return req->async_data;
d4e7cd36
JA
5796 return req->apoll->double_poll;
5797}
5798
5799static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5800{
5801 if (req->opcode == IORING_OP_POLL_ADD)
5802 return &req->poll;
5803 return &req->apoll->poll;
5804}
5805
5641897a 5806static void io_poll_req_insert(struct io_kiocb *req)
d4e7cd36 5807{
5641897a
PB
5808 struct io_ring_ctx *ctx = req->ctx;
5809 struct hlist_head *list;
18bceab1 5810
cef216fc 5811 list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
5641897a 5812 hlist_add_head(&req->hash_node, list);
18bceab1
JA
5813}
5814
5641897a
PB
5815static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5816 wait_queue_func_t wake_func)
18bceab1 5817{
5641897a 5818 poll->head = NULL;
5641897a
PB
5819#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5820 /* mask in events that we always want/need */
5821 poll->events = events | IO_POLL_UNMASK;
5822 INIT_LIST_HEAD(&poll->wait.entry);
5823 init_waitqueue_func_entry(&poll->wait, wake_func);
18bceab1
JA
5824}
5825
aa43477b 5826static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
18bceab1 5827{
791f3465 5828 struct wait_queue_head *head = smp_load_acquire(&poll->head);
18bceab1 5829
791f3465
PB
5830 if (head) {
5831 spin_lock_irq(&head->lock);
5832 list_del_init(&poll->wait.entry);
5833 poll->head = NULL;
5834 spin_unlock_irq(&head->lock);
5835 }
aa43477b 5836}
18bceab1 5837
aa43477b
PB
5838static void io_poll_remove_entries(struct io_kiocb *req)
5839{
91eac1c6
JA
5840 /*
5841	 * Nothing to do if neither of those flags is set. Avoid dipping
5842 * into the poll/apoll/double cachelines if we can.
5843 */
5844 if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
5845 return;
18bceab1 5846
791f3465
PB
5847 /*
5848 * While we hold the waitqueue lock and the waitqueue is nonempty,
5849 * wake_up_pollfree() will wait for us. However, taking the waitqueue
5850 * lock in the first place can race with the waitqueue being freed.
5851 *
5852 * We solve this as eventpoll does: by taking advantage of the fact that
5853 * all users of wake_up_pollfree() will RCU-delay the actual free. If
5854 * we enter rcu_read_lock() and see that the pointer to the queue is
5855 * non-NULL, we can then lock it without the memory being freed out from
5856 * under us.
5857 *
5858 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
5859 * case the caller deletes the entry from the queue, leaving it empty.
5860 * In that case, only RCU prevents the queue memory from being freed.
5861 */
5862 rcu_read_lock();
91eac1c6
JA
5863 if (req->flags & REQ_F_SINGLE_POLL)
5864 io_poll_remove_entry(io_poll_get_single(req));
5865 if (req->flags & REQ_F_DOUBLE_POLL)
5866 io_poll_remove_entry(io_poll_get_double(req));
791f3465 5867 rcu_read_unlock();
18bceab1
JA
5868}
5869
aa43477b
PB
5870/*
5871 * All poll tw should go through this. Checks for poll events, manages
5872 * references, does rewait, etc.
5873 *
5874 * Returns a negative error on failure. >0 when no action is required, which is
5875 * either a spurious wakeup or a served multishot CQE. 0 when it's done with
cef216fc 5876 * the request, then the mask is stored in req->cqe.res.
aa43477b 5877 */
5106dd6e 5878static int io_poll_check_events(struct io_kiocb *req, bool locked)
18bceab1 5879{
74ce6ce4 5880 struct io_ring_ctx *ctx = req->ctx;
aa43477b 5881 int v;
18bceab1 5882
316319e8 5883 /* req->task == current here, checking PF_EXITING is safe */
e09ee510 5884 if (unlikely(req->task->flags & PF_EXITING))
f2219057 5885 return -ECANCELED;
18bceab1 5886
aa43477b
PB
5887 do {
5888 v = atomic_read(&req->poll_refs);
74ce6ce4 5889
aa43477b
PB
5890 /* tw handler should be the owner, and so have some references */
5891 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
5892 return 0;
5893 if (v & IO_POLL_CANCEL_FLAG)
5894 return -ECANCELED;
8706e04e 5895
cef216fc 5896 if (!req->cqe.res) {
2804ecd8 5897 struct poll_table_struct pt = { ._key = req->apoll_events };
cce64ef0 5898 unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
18bceab1 5899
cce64ef0 5900 if (unlikely(!io_assign_file(req, flags)))
7179c3ce 5901 return -EBADF;
cef216fc 5902 req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
c8b5e260 5903 }
74ce6ce4 5904
aa43477b 5905		/* multishot, just fill a CQE and proceed */
cef216fc
PB
5906 if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) {
5907 __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events);
aa43477b 5908 bool filled;
18bceab1 5909
aa43477b 5910 spin_lock(&ctx->completion_lock);
cef216fc 5911 filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask,
aa43477b
PB
5912 IORING_CQE_F_MORE);
5913 io_commit_cqring(ctx);
5914 spin_unlock(&ctx->completion_lock);
5915 if (unlikely(!filled))
5916 return -ECANCELED;
5917 io_cqring_ev_posted(ctx);
cef216fc 5918 } else if (req->cqe.res) {
aa43477b
PB
5919 return 0;
5920 }
18bceab1 5921
aa43477b
PB
5922 /*
5923 * Release all references, retry if someone tried to restart
5924 * task_work while we were executing it.
5925 */
5926 } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
18bceab1 5927
18bceab1
JA
5928 return 1;
5929}
5930
aa43477b 5931static void io_poll_task_func(struct io_kiocb *req, bool *locked)
18bceab1 5932{
18bceab1 5933 struct io_ring_ctx *ctx = req->ctx;
aa43477b 5934 int ret;
18bceab1 5935
5106dd6e 5936 ret = io_poll_check_events(req, *locked);
aa43477b
PB
5937 if (ret > 0)
5938 return;
5939
5940 if (!ret) {
cef216fc 5941 req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
e27414be 5942 } else {
cef216fc 5943 req->cqe.res = ret;
aa43477b 5944 req_set_fail(req);
a62682f9 5945 }
aa43477b
PB
5946
5947 io_poll_remove_entries(req);
5948 spin_lock(&ctx->completion_lock);
5949 hash_del(&req->hash_node);
cef216fc 5950 __io_req_complete_post(req, req->cqe.res, 0);
aa43477b
PB
5951 io_commit_cqring(ctx);
5952 spin_unlock(&ctx->completion_lock);
5953 io_cqring_ev_posted(ctx);
18bceab1
JA
5954}
5955
aa43477b 5956static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
18bceab1
JA
5957{
5958 struct io_ring_ctx *ctx = req->ctx;
aa43477b 5959 int ret;
18bceab1 5960
5106dd6e 5961 ret = io_poll_check_events(req, *locked);
aa43477b
PB
5962 if (ret > 0)
5963 return;
18bceab1 5964
aa43477b
PB
5965 io_poll_remove_entries(req);
5966 spin_lock(&ctx->completion_lock);
5967 hash_del(&req->hash_node);
5968 spin_unlock(&ctx->completion_lock);
18bceab1 5969
aa43477b
PB
5970 if (!ret)
5971 io_req_task_submit(req, locked);
5972 else
5973 io_req_complete_failed(req, ret);
18bceab1
JA
5974}
5975
81459350 5976static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
aa43477b 5977{
cef216fc 5978 req->cqe.res = mask;
81459350
JA
5979 /*
5980 * This is useful for poll that is armed on behalf of another
5981 * request, and where the wakeup path could be on a different
5982 * CPU. We want to avoid pulling in req->apoll->events for that
5983 * case.
5984 */
2804ecd8 5985 req->apoll_events = events;
aa43477b
PB
5986 if (req->opcode == IORING_OP_POLL_ADD)
5987 req->io_task_work.func = io_poll_task_func;
5988 else
5989 req->io_task_work.func = io_apoll_task_func;
5990
cef216fc 5991 trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
aa43477b
PB
5992 io_req_task_work_add(req, false);
5993}
5994
81459350 5995static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
aa43477b
PB
5996{
5997 if (io_poll_get_ownership(req))
81459350 5998 __io_poll_execute(req, res, events);
aa43477b
PB
5999}
6000
6001static void io_poll_cancel_req(struct io_kiocb *req)
6002{
6003 io_poll_mark_cancelled(req);
6004 /* kick tw, which should complete the request */
81459350 6005 io_poll_execute(req, 0, 0);
aa43477b
PB
6006}
6007
d89a4fac
JA
6008#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
6009#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
6010
aa43477b
PB
6011static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
6012 void *key)
18bceab1 6013{
d89a4fac 6014 struct io_kiocb *req = wqe_to_req(wait);
aa43477b
PB
6015 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
6016 wait);
18bceab1
JA
6017 __poll_t mask = key_to_poll(key);
6018
791f3465
PB
6019 if (unlikely(mask & POLLFREE)) {
6020 io_poll_mark_cancelled(req);
6021 /* we have to kick tw in case it's not already */
81459350 6022 io_poll_execute(req, 0, poll->events);
791f3465
PB
6023
6024 /*
6025		 * If the waitqueue is being freed early but someone already
6026		 * holds ownership over it, we have to tear down the request as
6027 * best we can. That means immediately removing the request from
6028 * its waitqueue and preventing all further accesses to the
6029 * waitqueue via the request.
6030 */
6031 list_del_init(&poll->wait.entry);
6032
6033 /*
6034 * Careful: this *must* be the last step, since as soon
6035 * as req->head is NULL'ed out, the request can be
6036 * completed and freed, since aio_poll_complete_work()
6037 * will no longer need to take the waitqueue lock.
6038 */
6039 smp_store_release(&poll->head, NULL);
6040 return 1;
6041 }
6042
aa43477b 6043 /* for instances that support it check for an event match first */
18bceab1
JA
6044 if (mask && !(mask & poll->events))
6045 return 0;
6046
eb0089d6
PB
6047 if (io_poll_get_ownership(req)) {
6048 /* optional, saves extra locking for removal in tw handler */
6049 if (mask && poll->events & EPOLLONESHOT) {
6050 list_del_init(&poll->wait.entry);
6051 poll->head = NULL;
d89a4fac
JA
6052 if (wqe_is_double(wait))
6053 req->flags &= ~REQ_F_DOUBLE_POLL;
6054 else
6055 req->flags &= ~REQ_F_SINGLE_POLL;
eb0089d6 6056 }
81459350 6057 __io_poll_execute(req, mask, poll->events);
eb0089d6 6058 }
18bceab1 6059 return 1;
18bceab1
JA
6060}
6061
6062static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
6063 struct wait_queue_head *head,
6064 struct io_poll_iocb **poll_ptr)
18bceab1
JA
6065{
6066 struct io_kiocb *req = pt->req;
d89a4fac 6067 unsigned long wqe_private = (unsigned long) req;
18bceab1
JA
6068
6069 /*
68b11e8b
PB
6070 * The file being polled uses multiple waitqueues for poll handling
6071	 * (e.g. one for read, one for write). Set up a separate io_poll_iocb
6072 * if this happens.
18bceab1 6073 */
68b11e8b 6074 if (unlikely(pt->nr_entries)) {
aa43477b 6075 struct io_poll_iocb *first = poll;
58852d4d 6076
23a65db8 6077 /* double add on the same waitqueue head, ignore */
aa43477b 6078 if (first->head == head)
23a65db8 6079 return;
18bceab1 6080 /* already have a 2nd entry, fail a third attempt */
807abcb0 6081 if (*poll_ptr) {
23a65db8
PB
6082 if ((*poll_ptr)->head == head)
6083 return;
18bceab1
JA
6084 pt->error = -EINVAL;
6085 return;
6086 }
aa43477b 6087
18bceab1
JA
6088 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
6089 if (!poll) {
6090 pt->error = -ENOMEM;
6091 return;
6092 }
d89a4fac
JA
6093 /* mark as double wq entry */
6094 wqe_private |= 1;
91eac1c6 6095 req->flags |= REQ_F_DOUBLE_POLL;
aa43477b 6096 io_init_poll_iocb(poll, first->events, first->wait.func);
807abcb0 6097 *poll_ptr = poll;
d886e185
PB
6098 if (req->opcode == IORING_OP_POLL_ADD)
6099 req->flags |= REQ_F_ASYNC_DATA;
18bceab1
JA
6100 }
6101
91eac1c6 6102 req->flags |= REQ_F_SINGLE_POLL;
68b11e8b 6103 pt->nr_entries++;
18bceab1 6104 poll->head = head;
d89a4fac 6105 poll->wait.private = (void *) wqe_private;
a31eb4a2
JX
6106
6107 if (poll->events & EPOLLEXCLUSIVE)
6108 add_wait_queue_exclusive(head, &poll->wait);
6109 else
6110 add_wait_queue(head, &poll->wait);
18bceab1
JA
6111}
6112
aa43477b 6113static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
18bceab1
JA
6114 struct poll_table_struct *p)
6115{
6116 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
d7718a9d 6117
aa43477b
PB
6118 __io_queue_proc(&pt->req->poll, pt, head,
6119 (struct io_poll_iocb **) &pt->req->async_data);
d7718a9d
JA
6120}
6121
aa43477b
PB
6122static int __io_arm_poll_handler(struct io_kiocb *req,
6123 struct io_poll_iocb *poll,
6124 struct io_poll_table *ipt, __poll_t mask)
d7718a9d
JA
6125{
6126 struct io_ring_ctx *ctx = req->ctx;
aa43477b 6127 int v;
d7718a9d 6128
4d52f338 6129 INIT_HLIST_NODE(&req->hash_node);
aa43477b 6130 io_init_poll_iocb(poll, mask, io_poll_wake);
b90cd197 6131 poll->file = req->file;
d7718a9d
JA
6132
6133 ipt->pt._key = mask;
6134 ipt->req = req;
68b11e8b
PB
6135 ipt->error = 0;
6136 ipt->nr_entries = 0;
d7718a9d 6137
aa43477b
PB
6138 /*
6139 * Take the ownership to delay any tw execution up until we're done
6140	 * with poll arming; see io_poll_get_ownership().
6141 */
6142 atomic_set(&req->poll_refs, 1);
d7718a9d 6143 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
aa43477b
PB
6144
6145 if (mask && (poll->events & EPOLLONESHOT)) {
6146 io_poll_remove_entries(req);
6147 /* no one else has access to the req, forget about the ref */
6148 return mask;
6149 }
6150 if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
6151 io_poll_remove_entries(req);
6152 if (!ipt->error)
6153 ipt->error = -EINVAL;
6154 return 0;
6155 }
d7718a9d 6156
79ebeaee 6157 spin_lock(&ctx->completion_lock);
aa43477b
PB
6158 io_poll_req_insert(req);
6159 spin_unlock(&ctx->completion_lock);
6160
6161 if (mask) {
6162 /* can't multishot if failed, just queue the event we've got */
6163 if (unlikely(ipt->error || !ipt->nr_entries))
6164 poll->events |= EPOLLONESHOT;
81459350 6165 __io_poll_execute(req, mask, poll->events);
aa43477b 6166 return 0;
d7718a9d
JA
6167 }
6168
aa43477b
PB
6169 /*
6170 * Release ownership. If someone tried to queue a tw while it was
6171 * locked, kick it off for them.
6172 */
6173 v = atomic_dec_return(&req->poll_refs);
6174 if (unlikely(v & IO_POLL_REF_MASK))
81459350 6175 __io_poll_execute(req, 0, poll->events);
aa43477b
PB
6176 return 0;
6177}
6178
6179static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
6180 struct poll_table_struct *p)
6181{
6182 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6183 struct async_poll *apoll = pt->req->apoll;
6184
6185 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
d7718a9d
JA
6186}
6187
59b735ae
OL
6188enum {
6189 IO_APOLL_OK,
6190 IO_APOLL_ABORTED,
6191 IO_APOLL_READY
6192};
6193
4d9237e3 6194static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
d7718a9d
JA
6195{
6196 const struct io_op_def *def = &io_op_defs[req->opcode];
6197 struct io_ring_ctx *ctx = req->ctx;
6198 struct async_poll *apoll;
6199 struct io_poll_table ipt;
aa43477b
PB
6200 __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
6201 int ret;
d7718a9d 6202
b2d9c3da
PB
6203 if (!def->pollin && !def->pollout)
6204 return IO_APOLL_ABORTED;
658d0a40
PB
6205 if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
6206 return IO_APOLL_ABORTED;
b2d9c3da
PB
6207
6208 if (def->pollin) {
b2d9c3da
PB
6209 mask |= POLLIN | POLLRDNORM;
6210
6211 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
6212 if ((req->opcode == IORING_OP_RECVMSG) &&
6213 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
6214 mask &= ~POLLIN;
6215 } else {
b2d9c3da
PB
6216 mask |= POLLOUT | POLLWRNORM;
6217 }
52dd8640
DY
6218 if (def->poll_exclusive)
6219 mask |= EPOLLEXCLUSIVE;
4d9237e3
JA
6220 if (!(issue_flags & IO_URING_F_UNLOCKED) &&
6221 !list_empty(&ctx->apoll_cache)) {
6222 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
6223 poll.wait.entry);
6224 list_del_init(&apoll->poll.wait.entry);
6225 } else {
6226 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
6227 if (unlikely(!apoll))
6228 return IO_APOLL_ABORTED;
6229 }
807abcb0 6230 apoll->double_poll = NULL;
d7718a9d 6231 req->apoll = apoll;
b2d9c3da 6232 req->flags |= REQ_F_POLLED;
d7718a9d
JA
6233 ipt.pt._qproc = io_async_queue_proc;
6234
4d55f238 6235 io_kbuf_recycle(req, issue_flags);
abdad709 6236
aa43477b 6237 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
41a5169c
HX
6238 if (ret || ipt.error)
6239 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
6240
cef216fc 6241 trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
236daeae 6242 mask, apoll->poll.events);
59b735ae 6243 return IO_APOLL_OK;
d7718a9d
JA
6244}
6245
76e1b642
JA
6246/*
6247 * Returns true if we found and killed one or more poll requests
6248 */
c072481d
PB
6249static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
6250 struct task_struct *tsk, bool cancel_all)
221c5eb2 6251{
78076bb6 6252 struct hlist_node *tmp;
221c5eb2 6253 struct io_kiocb *req;
aa43477b
PB
6254 bool found = false;
6255 int i;
221c5eb2 6256
79ebeaee 6257 spin_lock(&ctx->completion_lock);
78076bb6
JA
6258 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
6259 struct hlist_head *list;
6260
6261 list = &ctx->cancel_hash[i];
f3606e3a 6262 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
42a7b4ed 6263 if (io_match_task_safe(req, tsk, cancel_all)) {
61bc84c4 6264 hlist_del_init(&req->hash_node);
aa43477b
PB
6265 io_poll_cancel_req(req);
6266 found = true;
6267 }
f3606e3a 6268 }
221c5eb2 6269 }
79ebeaee 6270 spin_unlock(&ctx->completion_lock);
aa43477b 6271 return found;
221c5eb2
JA
6272}
6273
9ba5fac8
PB
6274static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
6275 bool poll_only)
e07785b0 6276 __must_hold(&ctx->completion_lock)
47f46768 6277{
78076bb6 6278 struct hlist_head *list;
47f46768
JA
6279 struct io_kiocb *req;
6280
78076bb6
JA
6281 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
6282 hlist_for_each_entry(req, list, hash_node) {
cef216fc 6283 if (sqe_addr != req->cqe.user_data)
b41e9852 6284 continue;
9ba5fac8
PB
6285 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
6286 continue;
b2cb805f 6287 return req;
47f46768 6288 }
b2cb805f
JA
6289 return NULL;
6290}
6291
aa43477b
PB
6292static bool io_poll_disarm(struct io_kiocb *req)
6293 __must_hold(&ctx->completion_lock)
6294{
6295 if (!io_poll_get_ownership(req))
6296 return false;
6297 io_poll_remove_entries(req);
6298 hash_del(&req->hash_node);
6299 return true;
6300}
6301
9ba5fac8
PB
6302static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
6303 bool poll_only)
e07785b0 6304 __must_hold(&ctx->completion_lock)
b2cb805f 6305{
aa43477b 6306 struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
b2cb805f 6307
b2cb805f
JA
6308 if (!req)
6309 return -ENOENT;
aa43477b
PB
6310 io_poll_cancel_req(req);
6311 return 0;
47f46768
JA
6312}
6313
9096af3e
PB
6314static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
6315 unsigned int flags)
6316{
6317 u32 events;
47f46768 6318
9096af3e
PB
6319 events = READ_ONCE(sqe->poll32_events);
6320#ifdef __BIG_ENDIAN
6321 events = swahw32(events);
6322#endif
6323 if (!(flags & IORING_POLL_ADD_MULTI))
6324 events |= EPOLLONESHOT;
6325 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
47f46768
JA
6326}
6327
c5de0036 6328static int io_poll_update_prep(struct io_kiocb *req,
3529d8c2 6329 const struct io_uring_sqe *sqe)
0969e783 6330{
c5de0036
PB
6331 struct io_poll_update *upd = &req->poll_update;
6332 u32 flags;
6333
0969e783
JA
6334 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6335 return -EINVAL;
26578cda 6336 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
c5de0036
PB
6337 return -EINVAL;
6338 flags = READ_ONCE(sqe->len);
6339 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
6340 IORING_POLL_ADD_MULTI))
6341 return -EINVAL;
6342 /* meaningless without update */
6343 if (flags == IORING_POLL_ADD_MULTI)
0969e783
JA
6344 return -EINVAL;
6345
c5de0036
PB
6346 upd->old_user_data = READ_ONCE(sqe->addr);
6347 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
6348 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
221c5eb2 6349
c5de0036
PB
6350 upd->new_user_data = READ_ONCE(sqe->off);
6351 if (!upd->update_user_data && upd->new_user_data)
6352 return -EINVAL;
6353 if (upd->update_events)
6354 upd->events = io_poll_parse_events(sqe, flags);
6355 else if (sqe->poll32_events)
6356 return -EINVAL;
221c5eb2 6357
221c5eb2
JA
6358 return 0;
6359}
6360
3529d8c2 6361static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
6362{
6363 struct io_poll_iocb *poll = &req->poll;
c5de0036 6364 u32 flags;
221c5eb2
JA
6365
6366 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6367 return -EINVAL;
c5de0036 6368 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
88e41cf9
JA
6369 return -EINVAL;
6370 flags = READ_ONCE(sqe->len);
c5de0036 6371 if (flags & ~IORING_POLL_ADD_MULTI)
221c5eb2 6372 return -EINVAL;
04c76b41
PB
6373 if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
6374 return -EINVAL;
221c5eb2 6375
48dcd38d 6376 io_req_set_refcount(req);
2804ecd8 6377 req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
0969e783
JA
6378 return 0;
6379}
6380
61e98203 6381static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
6382{
6383 struct io_poll_iocb *poll = &req->poll;
0969e783 6384 struct io_poll_table ipt;
aa43477b 6385 int ret;
0969e783 6386
d7718a9d 6387 ipt.pt._qproc = io_poll_queue_proc;
36703247 6388
aa43477b
PB
6389 ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
6390 ret = ret ?: ipt.error;
6391 if (ret)
6392 __io_req_complete(req, issue_flags, ret, 0);
6393 return 0;
221c5eb2
JA
6394}
6395
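A single-shot poll from userspace, sketched with liburing's io_uring_prep_poll_add() (assumed available); the pipe and the user_data value are placeholders. The completion carries the ready mask in res; setting IORING_POLL_ADD_MULTI in sqe->len instead would request multishot behaviour.

#include <liburing.h>
#include <poll.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int pfd[2];

	pipe(pfd);
	io_uring_queue_init(8, &ring, 0);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, pfd[0], POLLIN);	/* single-shot */
	io_uring_sqe_set_data(sqe, (void *)0x1);
	io_uring_submit(&ring);

	write(pfd[1], "x", 1);			/* make the read end ready */

	io_uring_wait_cqe(&ring, &cqe);
	printf("mask 0x%x data %p\n", cqe->res, io_uring_cqe_get_data(cqe));
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}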
c5de0036 6396static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
b69de288
JA
6397{
6398 struct io_ring_ctx *ctx = req->ctx;
6399 struct io_kiocb *preq;
2bbb146d 6400 int ret2, ret = 0;
cc8e9ba7 6401 bool locked;
b69de288 6402
79ebeaee 6403 spin_lock(&ctx->completion_lock);
9ba5fac8 6404 preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
aa43477b 6405 if (!preq || !io_poll_disarm(preq)) {
79ebeaee 6406 spin_unlock(&ctx->completion_lock);
aa43477b 6407 ret = preq ? -EALREADY : -ENOENT;
2bbb146d 6408 goto out;
b69de288 6409 }
79ebeaee 6410 spin_unlock(&ctx->completion_lock);
cb3b200e 6411
2bbb146d
PB
6412 if (req->poll_update.update_events || req->poll_update.update_user_data) {
6413 /* only mask one event flags, keep behavior flags */
6414 if (req->poll_update.update_events) {
6415 preq->poll.events &= ~0xffff;
6416 preq->poll.events |= req->poll_update.events & 0xffff;
6417 preq->poll.events |= IO_POLL_UNMASK;
cb3b200e 6418 }
2bbb146d 6419 if (req->poll_update.update_user_data)
cef216fc 6420 preq->cqe.user_data = req->poll_update.new_user_data;
b69de288 6421
2bbb146d
PB
6422 ret2 = io_poll_add(preq, issue_flags);
6423 /* successfully updated, don't complete poll request */
6424 if (!ret2)
6425 goto out;
b69de288 6426 }
6224590d 6427
2bbb146d 6428 req_set_fail(preq);
cef216fc 6429 preq->cqe.res = -ECANCELED;
cc8e9ba7
PB
6430 locked = !(issue_flags & IO_URING_F_UNLOCKED);
6431 io_req_task_complete(preq, &locked);
2bbb146d
PB
6432out:
6433 if (ret < 0)
6224590d 6434 req_set_fail(req);
2bbb146d 6435 /* complete update request, we're done with it */
cc8e9ba7 6436 __io_req_complete(req, issue_flags, ret, 0);
b69de288 6437 return 0;
89850fce
JA
6438}
6439
5262f567
JA
6440static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
6441{
ad8a48ac
JA
6442 struct io_timeout_data *data = container_of(timer,
6443 struct io_timeout_data, timer);
6444 struct io_kiocb *req = data->req;
6445 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
6446 unsigned long flags;
6447
89850fce 6448 spin_lock_irqsave(&ctx->timeout_lock, flags);
a71976f3 6449 list_del_init(&req->timeout.list);
01cec8c1
PB
6450 atomic_set(&req->ctx->cq_timeouts,
6451 atomic_read(&req->ctx->cq_timeouts) + 1);
89850fce 6452 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
01cec8c1 6453
a90c8bf6
PB
6454 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
6455 req_set_fail(req);
6456
cef216fc 6457 req->cqe.res = -ETIME;
a90c8bf6 6458 req->io_task_work.func = io_req_task_complete;
4813c377 6459 io_req_task_work_add(req, false);
5262f567
JA
6460 return HRTIMER_NORESTART;
6461}
6462
fbd15848
PB
6463static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
6464 __u64 user_data)
89850fce 6465 __must_hold(&ctx->timeout_lock)
f254ac04 6466{
fbd15848 6467 struct io_timeout_data *io;
47f46768 6468 struct io_kiocb *req;
fd9c7bc5 6469 bool found = false;
f254ac04 6470
135fcde8 6471 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
cef216fc 6472 found = user_data == req->cqe.user_data;
fd9c7bc5 6473 if (found)
47f46768 6474 break;
47f46768 6475 }
fd9c7bc5
PB
6476 if (!found)
6477 return ERR_PTR(-ENOENT);
fbd15848
PB
6478
6479 io = req->async_data;
fd9c7bc5 6480 if (hrtimer_try_to_cancel(&io->timer) == -1)
fbd15848 6481 return ERR_PTR(-EALREADY);
a71976f3 6482 list_del_init(&req->timeout.list);
fbd15848
PB
6483 return req;
6484}
47f46768 6485
fbd15848 6486static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
ec3c3d0f 6487 __must_hold(&ctx->completion_lock)
89850fce 6488 __must_hold(&ctx->timeout_lock)
fbd15848
PB
6489{
6490 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6491
6492 if (IS_ERR(req))
6493 return PTR_ERR(req);
6695490d 6494 io_req_task_queue_fail(req, -ECANCELED);
f254ac04
JA
6495 return 0;
6496}
6497
50c1df2b
JA
6498static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
6499{
6500 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
6501 case IORING_TIMEOUT_BOOTTIME:
6502 return CLOCK_BOOTTIME;
6503 case IORING_TIMEOUT_REALTIME:
6504 return CLOCK_REALTIME;
6505 default:
6506 /* can't happen, vetted at prep time */
6507 WARN_ON_ONCE(1);
6508 fallthrough;
6509 case 0:
6510 return CLOCK_MONOTONIC;
6511 }
6512}
6513
f1042b6c
PB
6514static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6515 struct timespec64 *ts, enum hrtimer_mode mode)
6516 __must_hold(&ctx->timeout_lock)
6517{
6518 struct io_timeout_data *io;
6519 struct io_kiocb *req;
6520 bool found = false;
6521
6522 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
cef216fc 6523 found = user_data == req->cqe.user_data;
f1042b6c
PB
6524 if (found)
6525 break;
6526 }
6527 if (!found)
6528 return -ENOENT;
6529
6530 io = req->async_data;
6531 if (hrtimer_try_to_cancel(&io->timer) == -1)
6532 return -EALREADY;
6533 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6534 io->timer.function = io_link_timeout_fn;
6535 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6536 return 0;
6537}
6538
9c8e11b3
PB
6539static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6540 struct timespec64 *ts, enum hrtimer_mode mode)
89850fce 6541 __must_hold(&ctx->timeout_lock)
47f46768 6542{
9c8e11b3
PB
6543 struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6544 struct io_timeout_data *data;
47f46768 6545
9c8e11b3
PB
6546 if (IS_ERR(req))
6547 return PTR_ERR(req);
47f46768 6548
9c8e11b3
PB
6549 req->timeout.off = 0; /* noseq */
6550 data = req->async_data;
6551 list_add_tail(&req->timeout.list, &ctx->timeout_list);
50c1df2b 6552 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
9c8e11b3
PB
6553 data->timer.function = io_timeout_fn;
6554 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6555 return 0;
47f46768
JA
6556}
6557
3529d8c2
JA
6558static int io_timeout_remove_prep(struct io_kiocb *req,
6559 const struct io_uring_sqe *sqe)
b29472ee 6560{
9c8e11b3
PB
6561 struct io_timeout_rem *tr = &req->timeout_rem;
6562
b29472ee
JA
6563 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6564 return -EINVAL;
61710e43
DA
6565 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6566 return -EINVAL;
26578cda 6567 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
b29472ee
JA
6568 return -EINVAL;
6569
f1042b6c 6570 tr->ltimeout = false;
9c8e11b3
PB
6571 tr->addr = READ_ONCE(sqe->addr);
6572 tr->flags = READ_ONCE(sqe->timeout_flags);
f1042b6c
PB
6573 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6574 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6575 return -EINVAL;
6576 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6577 tr->ltimeout = true;
6578 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
9c8e11b3
PB
6579 return -EINVAL;
6580 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6581 return -EFAULT;
2087009c
YB
6582 if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
6583 return -EINVAL;
9c8e11b3
PB
6584 } else if (tr->flags) {
6585 /* timeout removal doesn't support flags */
b29472ee 6586 return -EINVAL;
9c8e11b3 6587 }
b29472ee 6588
b29472ee
JA
6589 return 0;
6590}
6591
8662daec
PB
6592static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6593{
6594 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6595 : HRTIMER_MODE_REL;
6596}
6597
11365043
JA
6598/*
6599 * Remove or update an existing timeout command
6600 */
61e98203 6601static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 6602{
9c8e11b3 6603 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 6604 struct io_ring_ctx *ctx = req->ctx;
47f46768 6605 int ret;
11365043 6606
ec3c3d0f
PB
6607 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6608 spin_lock(&ctx->completion_lock);
6609 spin_lock_irq(&ctx->timeout_lock);
9c8e11b3 6610 ret = io_timeout_cancel(ctx, tr->addr);
ec3c3d0f
PB
6611 spin_unlock_irq(&ctx->timeout_lock);
6612 spin_unlock(&ctx->completion_lock);
6613 } else {
f1042b6c
PB
6614 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6615
ec3c3d0f 6616 spin_lock_irq(&ctx->timeout_lock);
f1042b6c
PB
6617 if (tr->ltimeout)
6618 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6619 else
6620 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
ec3c3d0f
PB
6621 spin_unlock_irq(&ctx->timeout_lock);
6622 }
11365043 6623
4e88d6e7 6624 if (ret < 0)
93d2bcd2 6625 req_set_fail(req);
505657bc 6626 io_req_complete_post(req, ret, 0);
11365043 6627 return 0;
5262f567
JA
6628}
6629
3529d8c2 6630static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 6631 bool is_timeout_link)
5262f567 6632{
ad8a48ac 6633 struct io_timeout_data *data;
a41525ab 6634 unsigned flags;
56080b02 6635 u32 off = READ_ONCE(sqe->off);
5262f567 6636
ad8a48ac 6637 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5262f567 6638 return -EINVAL;
26578cda
PB
6639 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6640 sqe->splice_fd_in)
a41525ab 6641 return -EINVAL;
56080b02 6642 if (off && is_timeout_link)
2d28390a 6643 return -EINVAL;
a41525ab 6644 flags = READ_ONCE(sqe->timeout_flags);
6224590d
PB
6645 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6646 IORING_TIMEOUT_ETIME_SUCCESS))
50c1df2b
JA
6647 return -EINVAL;
6648 /* more than one clock specified is invalid, obviously */
6649 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5262f567 6650 return -EINVAL;
bdf20073 6651
ef9dd637 6652 INIT_LIST_HEAD(&req->timeout.list);
bfe68a22 6653 req->timeout.off = off;
f18ee4cf
PB
6654 if (unlikely(off && !req->ctx->off_timeout_used))
6655 req->ctx->off_timeout_used = true;
26a61679 6656
d6a644a7
PB
6657 if (WARN_ON_ONCE(req_has_async_data(req)))
6658 return -EFAULT;
6659 if (io_alloc_async_data(req))
26a61679
JA
6660 return -ENOMEM;
6661
e8c2bc1f 6662 data = req->async_data;
ad8a48ac 6663 data->req = req;
50c1df2b 6664 data->flags = flags;
ad8a48ac
JA
6665
6666 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
6667 return -EFAULT;
6668
f6223ff7
YB
6669 if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
6670 return -EINVAL;
6671
e677edbc 6672 INIT_LIST_HEAD(&req->timeout.list);
8662daec 6673 data->mode = io_translate_timeout_mode(flags);
50c1df2b 6674 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
b97e736a
PB
6675
6676 if (is_timeout_link) {
6677 struct io_submit_link *link = &req->ctx->submit_state.link;
6678
6679 if (!link->head)
6680 return -EINVAL;
6681 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6682 return -EINVAL;
4d13d1a4
PB
6683 req->timeout.head = link->last;
6684 link->last->flags |= REQ_F_ARM_LTIMEOUT;
b97e736a 6685 }
ad8a48ac
JA
6686 return 0;
6687}
6688
61e98203 6689static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 6690{
ad8a48ac 6691 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 6692 struct io_timeout_data *data = req->async_data;
ad8a48ac 6693 struct list_head *entry;
bfe68a22 6694 u32 tail, off = req->timeout.off;
ad8a48ac 6695
89850fce 6696 spin_lock_irq(&ctx->timeout_lock);
93bd25bb 6697
5262f567
JA
6698 /*
6699	 * sqe->off holds how many events need to occur for this
93bd25bb
JA
6700 * timeout event to be satisfied. If it isn't set, then this is
6701	 * a pure timeout request and the sequence isn't used.
5262f567 6702 */
8eb7e2d0 6703 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
6704 entry = ctx->timeout_list.prev;
6705 goto add;
6706 }
5262f567 6707
bfe68a22
PB
6708 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6709 req->timeout.target_seq = tail + off;
5262f567 6710
f010505b
MDG
6711 /* Update the last seq here in case io_flush_timeouts() hasn't.
6712 * This is safe because ->completion_lock is held, and submissions
6713 * and completions are never mixed in the same ->completion_lock section.
6714 */
6715 ctx->cq_last_tm_flush = tail;
6716
5262f567
JA
6717 /*
6718 * Insertion sort, ensuring the first entry in the list is always
6719 * the one we need first.
6720 */
5262f567 6721 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
6722 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6723 timeout.list);
5262f567 6724
8eb7e2d0 6725 if (io_is_timeout_noseq(nxt))
93bd25bb 6726 continue;
bfe68a22
PB
6727 /* nxt.seq is behind @tail, otherwise would've been completed */
6728 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
6729 break;
6730 }
93bd25bb 6731add:
135fcde8 6732 list_add(&req->timeout.list, entry);
ad8a48ac
JA
6733 data->timer.function = io_timeout_fn;
6734 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
89850fce 6735 spin_unlock_irq(&ctx->timeout_lock);
5262f567
JA
6736 return 0;
6737}
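
/*
 * A minimal userspace sketch of driving the timeout path above with
 * liburing (helper names assumed from liburing 2.x; error handling
 * trimmed). The 'count' argument lands in sqe->off, which io_timeout()
 * treats as the completion count; 0 selects the pure-timeout branch.
 */
#include <liburing.h>
#include <stdio.h>

int wait_one_second(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	int ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return -1;

	sqe = io_uring_get_sqe(&ring);
	/* count == 0 -> io_is_timeout_noseq(), i.e. a pure timeout */
	io_uring_prep_timeout(sqe, &ts, 0, 0);
	io_uring_submit(&ring);

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		/* expiry shows up as -ETIME in cqe->res */
		printf("timeout res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret;
}
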
5262f567 6738
f458dd84
PB
6739struct io_cancel_data {
6740 struct io_ring_ctx *ctx;
6741 u64 user_data;
6742};
6743
62755e35
JA
6744static bool io_cancel_cb(struct io_wq_work *work, void *data)
6745{
6746 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 6747 struct io_cancel_data *cd = data;
62755e35 6748
cef216fc 6749 return req->ctx == cd->ctx && req->cqe.user_data == cd->user_data;
62755e35
JA
6750}
6751
f458dd84
PB
6752static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6753 struct io_ring_ctx *ctx)
62755e35 6754{
f458dd84 6755 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
62755e35 6756 enum io_wq_cancel cancel_ret;
62755e35
JA
6757 int ret = 0;
6758
f458dd84 6759 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
6760 return -ENOENT;
6761
f458dd84 6762 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
62755e35
JA
6763 switch (cancel_ret) {
6764 case IO_WQ_CANCEL_OK:
6765 ret = 0;
6766 break;
6767 case IO_WQ_CANCEL_RUNNING:
6768 ret = -EALREADY;
6769 break;
6770 case IO_WQ_CANCEL_NOTFOUND:
6771 ret = -ENOENT;
6772 break;
6773 }
6774
e977d6d3
JA
6775 return ret;
6776}
6777
8cb01fac 6778static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
47f46768 6779{
8cb01fac 6780 struct io_ring_ctx *ctx = req->ctx;
47f46768
JA
6781 int ret;
6782
dadebc35 6783 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
8cb01fac 6784
f458dd84 6785 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
ccbf7261
JA
6786 /*
6787 * Fall-through even for -EALREADY, as we may have a poll handler
6788 * armed that needs unarming.
6789 */
6790 if (!ret)
6791 return 0;
505657bc
PB
6792
6793 spin_lock(&ctx->completion_lock);
ccbf7261
JA
6794 ret = io_poll_cancel(ctx, sqe_addr, false);
6795 if (ret != -ENOENT)
6796 goto out;
6797
79ebeaee 6798 spin_lock_irq(&ctx->timeout_lock);
47f46768 6799 ret = io_timeout_cancel(ctx, sqe_addr);
79ebeaee 6800 spin_unlock_irq(&ctx->timeout_lock);
505657bc
PB
6801out:
6802 spin_unlock(&ctx->completion_lock);
6803 return ret;
47f46768
JA
6804}
6805
3529d8c2
JA
6806static int io_async_cancel_prep(struct io_kiocb *req,
6807 const struct io_uring_sqe *sqe)
e977d6d3 6808{
fbf23849 6809 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
e977d6d3 6810 return -EINVAL;
61710e43
DA
6811 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6812 return -EINVAL;
26578cda
PB
6813 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6814 sqe->splice_fd_in)
e977d6d3
JA
6815 return -EINVAL;
6816
fbf23849
JA
6817 req->cancel.addr = READ_ONCE(sqe->addr);
6818 return 0;
6819}
6820
61e98203 6821static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6822{
6823 struct io_ring_ctx *ctx = req->ctx;
58f99373
PB
6824 u64 sqe_addr = req->cancel.addr;
6825 struct io_tctx_node *node;
6826 int ret;
6827
8cb01fac 6828 ret = io_try_cancel_userdata(req, sqe_addr);
58f99373
PB
6829 if (ret != -ENOENT)
6830 goto done;
58f99373
PB
6831
6832 /* slow path, try all io-wq's */
f8929630 6833 io_ring_submit_lock(ctx, issue_flags);
58f99373
PB
6834 ret = -ENOENT;
6835 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6836 struct io_uring_task *tctx = node->task->io_uring;
fbf23849 6837
58f99373
PB
6838 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6839 if (ret != -ENOENT)
6840 break;
6841 }
f8929630 6842 io_ring_submit_unlock(ctx, issue_flags);
58f99373 6843done:
58f99373 6844 if (ret < 0)
93d2bcd2 6845 req_set_fail(req);
505657bc 6846 io_req_complete_post(req, ret, 0);
5262f567
JA
6847 return 0;
6848}
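
/*
 * Sketch of what userspace puts in an IORING_OP_ASYNC_CANCEL SQE, matching
 * the fields io_async_cancel_prep() reads above. The sqe comes from
 * liburing's io_uring_get_sqe(); 'target_user_data' is a hypothetical
 * value that was used as user_data on the request being cancelled.
 */
#include <liburing.h>
#include <string.h>

static void prep_async_cancel(struct io_uring_sqe *sqe, __u64 target_user_data)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	/* io_async_cancel_prep() takes the target's user_data from sqe->addr */
	sqe->addr = target_user_data;
	/* off, len, cancel_flags and splice_fd_in must stay zero, see the prep above */
}
/*
 * The completion then carries 0, -EALREADY or -ENOENT, mirroring the
 * io_wq_cancel_cb() result mapping in io_async_cancel_one().
 */
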
6849
269bbe5f 6850static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
6851 const struct io_uring_sqe *sqe)
6852{
61710e43
DA
6853 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6854 return -EINVAL;
26578cda 6855 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
05f3fb3c
JA
6856 return -EINVAL;
6857
269bbe5f
BM
6858 req->rsrc_update.offset = READ_ONCE(sqe->off);
6859 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6860 if (!req->rsrc_update.nr_args)
05f3fb3c 6861 return -EINVAL;
269bbe5f 6862 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
6863 return 0;
6864}
6865
889fca73 6866static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
6867{
6868 struct io_ring_ctx *ctx = req->ctx;
c3bdad02 6869 struct io_uring_rsrc_update2 up;
05f3fb3c 6870 int ret;
fbf23849 6871
269bbe5f
BM
6872 up.offset = req->rsrc_update.offset;
6873 up.data = req->rsrc_update.arg;
c3bdad02
PB
6874 up.nr = 0;
6875 up.tags = 0;
615cee49 6876 up.resv = 0;
d8a3ba9c 6877 up.resv2 = 0;
05f3fb3c 6878
f8929630 6879 io_ring_submit_lock(ctx, issue_flags);
fdecb662 6880 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
98f0b3b4 6881 &up, req->rsrc_update.nr_args);
f8929630 6882 io_ring_submit_unlock(ctx, issue_flags);
05f3fb3c
JA
6883
6884 if (ret < 0)
93d2bcd2 6885 req_set_fail(req);
889fca73 6886 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
6887 return 0;
6888}
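
/*
 * Userspace side of IORING_OP_FILES_UPDATE, filling the fields read by
 * io_rsrc_update_prep() above (sqe from liburing; 'fds' is a caller
 * supplied array of descriptors, with -1 conventionally clearing a slot).
 */
#include <liburing.h>
#include <string.h>

static void prep_files_update(struct io_uring_sqe *sqe, int *fds,
			      unsigned nr, unsigned slot_offset)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FILES_UPDATE;
	sqe->addr = (unsigned long) fds;	/* req->rsrc_update.arg */
	sqe->len = nr;				/* req->rsrc_update.nr_args, must be non-zero */
	sqe->off = slot_offset;			/* req->rsrc_update.offset */
}
/* cqe->res then reports the number of slots updated, or a negative error. */
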
6889
bfe76559 6890static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 6891{
d625c6ee 6892 switch (req->opcode) {
e781573e 6893 case IORING_OP_NOP:
bfe76559 6894 return 0;
f67676d1
JA
6895 case IORING_OP_READV:
6896 case IORING_OP_READ_FIXED:
3a6820f2 6897 case IORING_OP_READ:
f67676d1
JA
6898 case IORING_OP_WRITEV:
6899 case IORING_OP_WRITE_FIXED:
3a6820f2 6900 case IORING_OP_WRITE:
584b0180 6901 return io_prep_rw(req, sqe);
0969e783 6902 case IORING_OP_POLL_ADD:
bfe76559 6903 return io_poll_add_prep(req, sqe);
0969e783 6904 case IORING_OP_POLL_REMOVE:
c5de0036 6905 return io_poll_update_prep(req, sqe);
8ed8d3c3 6906 case IORING_OP_FSYNC:
1155c76a 6907 return io_fsync_prep(req, sqe);
8ed8d3c3 6908 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 6909 return io_sfr_prep(req, sqe);
03b1230c 6910 case IORING_OP_SENDMSG:
fddaface 6911 case IORING_OP_SEND:
bfe76559 6912 return io_sendmsg_prep(req, sqe);
03b1230c 6913 case IORING_OP_RECVMSG:
fddaface 6914 case IORING_OP_RECV:
bfe76559 6915 return io_recvmsg_prep(req, sqe);
f499a021 6916 case IORING_OP_CONNECT:
bfe76559 6917 return io_connect_prep(req, sqe);
2d28390a 6918 case IORING_OP_TIMEOUT:
bfe76559 6919 return io_timeout_prep(req, sqe, false);
b29472ee 6920 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 6921 return io_timeout_remove_prep(req, sqe);
fbf23849 6922 case IORING_OP_ASYNC_CANCEL:
bfe76559 6923 return io_async_cancel_prep(req, sqe);
2d28390a 6924 case IORING_OP_LINK_TIMEOUT:
bfe76559 6925 return io_timeout_prep(req, sqe, true);
8ed8d3c3 6926 case IORING_OP_ACCEPT:
bfe76559 6927 return io_accept_prep(req, sqe);
d63d1b5e 6928 case IORING_OP_FALLOCATE:
bfe76559 6929 return io_fallocate_prep(req, sqe);
15b71abe 6930 case IORING_OP_OPENAT:
bfe76559 6931 return io_openat_prep(req, sqe);
b5dba59e 6932 case IORING_OP_CLOSE:
bfe76559 6933 return io_close_prep(req, sqe);
05f3fb3c 6934 case IORING_OP_FILES_UPDATE:
269bbe5f 6935 return io_rsrc_update_prep(req, sqe);
eddc7ef5 6936 case IORING_OP_STATX:
bfe76559 6937 return io_statx_prep(req, sqe);
4840e418 6938 case IORING_OP_FADVISE:
bfe76559 6939 return io_fadvise_prep(req, sqe);
c1ca757b 6940 case IORING_OP_MADVISE:
bfe76559 6941 return io_madvise_prep(req, sqe);
cebdb986 6942 case IORING_OP_OPENAT2:
bfe76559 6943 return io_openat2_prep(req, sqe);
3e4827b0 6944 case IORING_OP_EPOLL_CTL:
bfe76559 6945 return io_epoll_ctl_prep(req, sqe);
7d67af2c 6946 case IORING_OP_SPLICE:
bfe76559 6947 return io_splice_prep(req, sqe);
ddf0322d 6948 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 6949 return io_provide_buffers_prep(req, sqe);
067524e9 6950 case IORING_OP_REMOVE_BUFFERS:
bfe76559 6951 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 6952 case IORING_OP_TEE:
bfe76559 6953 return io_tee_prep(req, sqe);
36f4fa68
JA
6954 case IORING_OP_SHUTDOWN:
6955 return io_shutdown_prep(req, sqe);
80a261fd
JA
6956 case IORING_OP_RENAMEAT:
6957 return io_renameat_prep(req, sqe);
14a1143b
JA
6958 case IORING_OP_UNLINKAT:
6959 return io_unlinkat_prep(req, sqe);
e34a02dc
DK
6960 case IORING_OP_MKDIRAT:
6961 return io_mkdirat_prep(req, sqe);
7a8721f8
DK
6962 case IORING_OP_SYMLINKAT:
6963 return io_symlinkat_prep(req, sqe);
cf30da90
DK
6964 case IORING_OP_LINKAT:
6965 return io_linkat_prep(req, sqe);
4f57f06c
JA
6966 case IORING_OP_MSG_RING:
6967 return io_msg_ring_prep(req, sqe);
f67676d1
JA
6968 }
6969
bfe76559
PB
6970 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6971 req->opcode);
bd54b6fe 6972 return -EINVAL;
bfe76559
PB
6973}
6974
93642ef8 6975static int io_req_prep_async(struct io_kiocb *req)
bfe76559 6976{
b7e298d2
PB
6977 if (!io_op_defs[req->opcode].needs_async_setup)
6978 return 0;
d886e185 6979 if (WARN_ON_ONCE(req_has_async_data(req)))
b7e298d2
PB
6980 return -EFAULT;
6981 if (io_alloc_async_data(req))
6982 return -EAGAIN;
6983
93642ef8
PB
6984 switch (req->opcode) {
6985 case IORING_OP_READV:
93642ef8
PB
6986 return io_rw_prep_async(req, READ);
6987 case IORING_OP_WRITEV:
93642ef8
PB
6988 return io_rw_prep_async(req, WRITE);
6989 case IORING_OP_SENDMSG:
93642ef8
PB
6990 return io_sendmsg_prep_async(req);
6991 case IORING_OP_RECVMSG:
93642ef8
PB
6992 return io_recvmsg_prep_async(req);
6993 case IORING_OP_CONNECT:
6994 return io_connect_prep_async(req);
6995 }
b7e298d2
PB
6996 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6997 req->opcode);
6998 return -EFAULT;
f67676d1
JA
6999}
7000
9cf7c104
PB
7001static u32 io_get_sequence(struct io_kiocb *req)
7002{
a3dbdf54 7003 u32 seq = req->ctx->cached_sq_head;
963c6abb 7004 struct io_kiocb *cur;
9cf7c104 7005
a3dbdf54 7006 /* need original cached_sq_head, but it was increased for each req */
963c6abb 7007 io_for_each_link(cur, req)
a3dbdf54
PB
7008 seq--;
7009 return seq;
9cf7c104
PB
7010}
7011
c072481d 7012static __cold void io_drain_req(struct io_kiocb *req)
de0617e4 7013{
a197f664 7014 struct io_ring_ctx *ctx = req->ctx;
27dc8338 7015 struct io_defer_entry *de;
f67676d1 7016 int ret;
e0eb71dc 7017 u32 seq = io_get_sequence(req);
3c19966d 7018
9d858b21 7019 /* Still need defer if there is pending req in defer list. */
e302f104 7020 spin_lock(&ctx->completion_lock);
5e371265 7021 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
e302f104 7022 spin_unlock(&ctx->completion_lock);
e0eb71dc 7023queue:
10c66904 7024 ctx->drain_active = false;
e0eb71dc
PB
7025 io_req_task_queue(req);
7026 return;
10c66904 7027 }
e302f104 7028 spin_unlock(&ctx->completion_lock);
9cf7c104 7029
b7e298d2 7030 ret = io_req_prep_async(req);
e0eb71dc
PB
7031 if (ret) {
7032fail:
7033 io_req_complete_failed(req, ret);
7034 return;
7035 }
cbdcb435 7036 io_prep_async_link(req);
27dc8338 7037 de = kmalloc(sizeof(*de), GFP_KERNEL);
76cc33d7 7038 if (!de) {
1b48773f 7039 ret = -ENOMEM;
e0eb71dc 7040 goto fail;
76cc33d7 7041 }
2d28390a 7042
79ebeaee 7043 spin_lock(&ctx->completion_lock);
9cf7c104 7044 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
79ebeaee 7045 spin_unlock(&ctx->completion_lock);
27dc8338 7046 kfree(de);
e0eb71dc 7047 goto queue;
de0617e4
JA
7048 }
7049
cef216fc 7050 trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
27dc8338 7051 de->req = req;
9cf7c104 7052 de->seq = seq;
27dc8338 7053 list_add_tail(&de->list, &ctx->defer_list);
79ebeaee 7054 spin_unlock(&ctx->completion_lock);
de0617e4
JA
7055}
7056
68fb8979 7057static void io_clean_op(struct io_kiocb *req)
99bc4c38 7058{
8197b053
PB
7059 if (req->flags & REQ_F_BUFFER_SELECTED) {
7060 spin_lock(&req->ctx->completion_lock);
cc3cec83 7061 io_put_kbuf_comp(req);
8197b053
PB
7062 spin_unlock(&req->ctx->completion_lock);
7063 }
99bc4c38 7064
0e1b6fe3
PB
7065 if (req->flags & REQ_F_NEED_CLEANUP) {
7066 switch (req->opcode) {
7067 case IORING_OP_READV:
7068 case IORING_OP_READ_FIXED:
7069 case IORING_OP_READ:
7070 case IORING_OP_WRITEV:
7071 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
7072 case IORING_OP_WRITE: {
7073 struct io_async_rw *io = req->async_data;
1dacb4df
PB
7074
7075 kfree(io->free_iovec);
0e1b6fe3 7076 break;
e8c2bc1f 7077 }
0e1b6fe3 7078 case IORING_OP_RECVMSG:
e8c2bc1f
JA
7079 case IORING_OP_SENDMSG: {
7080 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
7081
7082 kfree(io->free_iov);
0e1b6fe3 7083 break;
e8c2bc1f 7084 }
f3cd4850
JA
7085 case IORING_OP_OPENAT:
7086 case IORING_OP_OPENAT2:
7087 if (req->open.filename)
7088 putname(req->open.filename);
7089 break;
80a261fd
JA
7090 case IORING_OP_RENAMEAT:
7091 putname(req->rename.oldpath);
7092 putname(req->rename.newpath);
7093 break;
14a1143b
JA
7094 case IORING_OP_UNLINKAT:
7095 putname(req->unlink.filename);
7096 break;
e34a02dc
DK
7097 case IORING_OP_MKDIRAT:
7098 putname(req->mkdir.filename);
7099 break;
7a8721f8
DK
7100 case IORING_OP_SYMLINKAT:
7101 putname(req->symlink.oldpath);
7102 putname(req->symlink.newpath);
7103 break;
cf30da90
DK
7104 case IORING_OP_LINKAT:
7105 putname(req->hardlink.oldpath);
7106 putname(req->hardlink.newpath);
7107 break;
1b6fe6e0
SR
7108 case IORING_OP_STATX:
7109 if (req->statx.filename)
7110 putname(req->statx.filename);
7111 break;
0e1b6fe3 7112 }
99bc4c38 7113 }
75652a30
JA
7114 if ((req->flags & REQ_F_POLLED) && req->apoll) {
7115 kfree(req->apoll->double_poll);
7116 kfree(req->apoll);
7117 req->apoll = NULL;
7118 }
c854357b 7119 if (req->flags & REQ_F_CREDS)
b8e64b53 7120 put_cred(req->creds);
d886e185
PB
7121 if (req->flags & REQ_F_ASYNC_DATA) {
7122 kfree(req->async_data);
7123 req->async_data = NULL;
7124 }
c854357b 7125 req->flags &= ~IO_REQ_CLEAN_FLAGS;
99bc4c38
PB
7126}
7127
6bf9c47a
JA
7128static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
7129{
7130 if (req->file || !io_op_defs[req->opcode].needs_file)
7131 return true;
7132
7133 if (req->flags & REQ_F_FIXED_FILE)
cef216fc 7134 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
6bf9c47a 7135 else
cef216fc 7136 req->file = io_file_get_normal(req, req->cqe.fd);
6bf9c47a
JA
7137 if (req->file)
7138 return true;
7139
7140 req_set_fail(req);
cef216fc 7141 req->cqe.res = -EBADF;
6bf9c47a
JA
7142 return false;
7143}
7144
889fca73 7145static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 7146{
5730b27e 7147 const struct cred *creds = NULL;
d625c6ee 7148 int ret;
2b188cc1 7149
70152140
JA
7150 if (unlikely(!io_assign_file(req, issue_flags)))
7151 return -EBADF;
7152
6878b40e 7153 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
c10d1f98 7154 creds = override_creds(req->creds);
5730b27e 7155
5bd2182d
PM
7156 if (!io_op_defs[req->opcode].audit_skip)
7157 audit_uring_entry(req->opcode);
7158
d625c6ee 7159 switch (req->opcode) {
2b188cc1 7160 case IORING_OP_NOP:
889fca73 7161 ret = io_nop(req, issue_flags);
2b188cc1
JA
7162 break;
7163 case IORING_OP_READV:
edafccee 7164 case IORING_OP_READ_FIXED:
3a6820f2 7165 case IORING_OP_READ:
889fca73 7166 ret = io_read(req, issue_flags);
edafccee 7167 break;
3529d8c2 7168 case IORING_OP_WRITEV:
edafccee 7169 case IORING_OP_WRITE_FIXED:
3a6820f2 7170 case IORING_OP_WRITE:
889fca73 7171 ret = io_write(req, issue_flags);
2b188cc1 7172 break;
c992fe29 7173 case IORING_OP_FSYNC:
45d189c6 7174 ret = io_fsync(req, issue_flags);
c992fe29 7175 break;
221c5eb2 7176 case IORING_OP_POLL_ADD:
61e98203 7177 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
7178 break;
7179 case IORING_OP_POLL_REMOVE:
c5de0036 7180 ret = io_poll_update(req, issue_flags);
221c5eb2 7181 break;
5d17b4a4 7182 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 7183 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 7184 break;
0fa03c62 7185 case IORING_OP_SENDMSG:
889fca73 7186 ret = io_sendmsg(req, issue_flags);
062d04d7 7187 break;
fddaface 7188 case IORING_OP_SEND:
889fca73 7189 ret = io_send(req, issue_flags);
0fa03c62 7190 break;
aa1fa28f 7191 case IORING_OP_RECVMSG:
889fca73 7192 ret = io_recvmsg(req, issue_flags);
062d04d7 7193 break;
fddaface 7194 case IORING_OP_RECV:
889fca73 7195 ret = io_recv(req, issue_flags);
aa1fa28f 7196 break;
5262f567 7197 case IORING_OP_TIMEOUT:
61e98203 7198 ret = io_timeout(req, issue_flags);
5262f567 7199 break;
11365043 7200 case IORING_OP_TIMEOUT_REMOVE:
61e98203 7201 ret = io_timeout_remove(req, issue_flags);
11365043 7202 break;
17f2fe35 7203 case IORING_OP_ACCEPT:
889fca73 7204 ret = io_accept(req, issue_flags);
17f2fe35 7205 break;
f8e85cf2 7206 case IORING_OP_CONNECT:
889fca73 7207 ret = io_connect(req, issue_flags);
f8e85cf2 7208 break;
62755e35 7209 case IORING_OP_ASYNC_CANCEL:
61e98203 7210 ret = io_async_cancel(req, issue_flags);
62755e35 7211 break;
d63d1b5e 7212 case IORING_OP_FALLOCATE:
45d189c6 7213 ret = io_fallocate(req, issue_flags);
d63d1b5e 7214 break;
15b71abe 7215 case IORING_OP_OPENAT:
45d189c6 7216 ret = io_openat(req, issue_flags);
15b71abe 7217 break;
b5dba59e 7218 case IORING_OP_CLOSE:
889fca73 7219 ret = io_close(req, issue_flags);
b5dba59e 7220 break;
05f3fb3c 7221 case IORING_OP_FILES_UPDATE:
889fca73 7222 ret = io_files_update(req, issue_flags);
05f3fb3c 7223 break;
eddc7ef5 7224 case IORING_OP_STATX:
45d189c6 7225 ret = io_statx(req, issue_flags);
eddc7ef5 7226 break;
4840e418 7227 case IORING_OP_FADVISE:
45d189c6 7228 ret = io_fadvise(req, issue_flags);
4840e418 7229 break;
c1ca757b 7230 case IORING_OP_MADVISE:
45d189c6 7231 ret = io_madvise(req, issue_flags);
c1ca757b 7232 break;
cebdb986 7233 case IORING_OP_OPENAT2:
45d189c6 7234 ret = io_openat2(req, issue_flags);
cebdb986 7235 break;
3e4827b0 7236 case IORING_OP_EPOLL_CTL:
889fca73 7237 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 7238 break;
7d67af2c 7239 case IORING_OP_SPLICE:
45d189c6 7240 ret = io_splice(req, issue_flags);
7d67af2c 7241 break;
ddf0322d 7242 case IORING_OP_PROVIDE_BUFFERS:
889fca73 7243 ret = io_provide_buffers(req, issue_flags);
ddf0322d 7244 break;
067524e9 7245 case IORING_OP_REMOVE_BUFFERS:
889fca73 7246 ret = io_remove_buffers(req, issue_flags);
3e4827b0 7247 break;
f2a8d5c7 7248 case IORING_OP_TEE:
45d189c6 7249 ret = io_tee(req, issue_flags);
f2a8d5c7 7250 break;
36f4fa68 7251 case IORING_OP_SHUTDOWN:
45d189c6 7252 ret = io_shutdown(req, issue_flags);
36f4fa68 7253 break;
80a261fd 7254 case IORING_OP_RENAMEAT:
45d189c6 7255 ret = io_renameat(req, issue_flags);
80a261fd 7256 break;
14a1143b 7257 case IORING_OP_UNLINKAT:
45d189c6 7258 ret = io_unlinkat(req, issue_flags);
14a1143b 7259 break;
e34a02dc
DK
7260 case IORING_OP_MKDIRAT:
7261 ret = io_mkdirat(req, issue_flags);
7262 break;
7a8721f8
DK
7263 case IORING_OP_SYMLINKAT:
7264 ret = io_symlinkat(req, issue_flags);
7265 break;
cf30da90
DK
7266 case IORING_OP_LINKAT:
7267 ret = io_linkat(req, issue_flags);
7268 break;
4f57f06c
JA
7269 case IORING_OP_MSG_RING:
7270 ret = io_msg_ring(req, issue_flags);
7271 break;
2b188cc1
JA
7272 default:
7273 ret = -EINVAL;
7274 break;
7275 }
7276
5bd2182d
PM
7277 if (!io_op_defs[req->opcode].audit_skip)
7278 audit_uring_exit(!ret, ret);
7279
5730b27e
JA
7280 if (creds)
7281 revert_creds(creds);
def596e9
JA
7282 if (ret)
7283 return ret;
b532576e 7284 /* If the op doesn't have a file, we're not polling for it */
9983028e 7285 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
9882131c 7286 io_iopoll_req_issued(req, issue_flags);
def596e9
JA
7287
7288 return 0;
2b188cc1
JA
7289}
7290
ebc11b6c
PB
7291static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
7292{
7293 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7294
7295 req = io_put_req_find_next(req);
7296 return req ? &req->work : NULL;
7297}
7298
5280f7e5 7299static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
7300{
7301 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6bf9c47a 7302 const struct io_op_def *def = &io_op_defs[req->opcode];
d01905db
PB
7303 unsigned int issue_flags = IO_URING_F_UNLOCKED;
7304 bool needs_poll = false;
6bf9c47a 7305 int ret = 0, err = -ECANCELED;
2b188cc1 7306
48dcd38d
PB
7307 /* one will be dropped by ->io_free_work() after returning to io-wq */
7308 if (!(req->flags & REQ_F_REFCOUNT))
7309 __io_req_set_refcount(req, 2);
7310 else
7311 req_ref_get(req);
5d5901a3 7312
cb2d344c 7313 io_arm_ltimeout(req);
6bf9c47a 7314
dadebc35 7315 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
d01905db 7316 if (work->flags & IO_WQ_WORK_CANCEL) {
0f8da75b 7317fail:
6bf9c47a 7318 io_req_task_queue_fail(req, err);
d01905db
PB
7319 return;
7320 }
0f8da75b
PB
7321 if (!io_assign_file(req, issue_flags)) {
7322 err = -EBADF;
7323 work->flags |= IO_WQ_WORK_CANCEL;
7324 goto fail;
7325 }
31b51510 7326
d01905db 7327 if (req->flags & REQ_F_FORCE_ASYNC) {
afb7f56f
PB
7328 bool opcode_poll = def->pollin || def->pollout;
7329
7330 if (opcode_poll && file_can_poll(req->file)) {
7331 needs_poll = true;
d01905db 7332 issue_flags |= IO_URING_F_NONBLOCK;
afb7f56f 7333 }
561fb04a 7334 }
31b51510 7335
d01905db
PB
7336 do {
7337 ret = io_issue_sqe(req, issue_flags);
7338 if (ret != -EAGAIN)
7339 break;
7340 /*
7341 * We can get EAGAIN for iopolled IO even though we're
7342 * forcing a sync submission from here, since we can't
7343 * wait for request slots on the block side.
7344 */
7345 if (!needs_poll) {
7346 cond_resched();
7347 continue;
90fa0288
HX
7348 }
7349
4d9237e3 7350 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
d01905db
PB
7351 return;
7352 /* aborted or ready, in either case retry blocking */
7353 needs_poll = false;
7354 issue_flags &= ~IO_URING_F_NONBLOCK;
7355 } while (1);
31b51510 7356
a3df7698 7357 /* avoid locking problems by failing it from a clean context */
5d5901a3 7358 if (ret)
a3df7698 7359 io_req_task_queue_fail(req, ret);
2b188cc1
JA
7360}
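
/*
 * Userspace entry into the REQ_F_FORCE_ASYNC branch above: a liburing
 * sketch that forces a request straight to io-wq with IOSQE_ASYNC
 * instead of attempting an inline non-blocking issue first.
 */
#include <liburing.h>

static void queue_forced_async_read(struct io_uring *ring, int fd,
				    void *buf, unsigned len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_read(sqe, fd, buf, len, 0);
	/* IOSQE_ASYNC is copied into req->flags as REQ_F_FORCE_ASYNC */
	io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);
}
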
7361
aeca241b 7362static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
042b0d85 7363 unsigned i)
65e19f54 7364{
042b0d85 7365 return &table->files[i];
dafecf19
PB
7366}
7367
65e19f54
JA
7368static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
7369 int index)
7370{
aeca241b 7371 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
65e19f54 7372
a04b0ac0 7373 return (struct file *) (slot->file_ptr & FFS_MASK);
65e19f54
JA
7374}
7375
a04b0ac0 7376static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
9a321c98
PB
7377{
7378 unsigned long file_ptr = (unsigned long) file;
7379
88459b50 7380 file_ptr |= io_file_get_flags(file);
a04b0ac0 7381 file_slot->file_ptr = file_ptr;
65e19f54
JA
7382}
7383
5106dd6e
JA
7384static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
7385 unsigned int issue_flags)
09bb8394 7386{
5106dd6e
JA
7387 struct io_ring_ctx *ctx = req->ctx;
7388 struct file *file = NULL;
ac177053 7389 unsigned long file_ptr;
09bb8394 7390
93f052cb 7391 io_ring_submit_lock(ctx, issue_flags);
5106dd6e 7392
ac177053 7393 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
5106dd6e 7394 goto out;
ac177053
PB
7395 fd = array_index_nospec(fd, ctx->nr_user_files);
7396 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
7397 file = (struct file *) (file_ptr & FFS_MASK);
7398 file_ptr &= ~FFS_MASK;
7399 /* mask in overlapping REQ_F and FFS bits */
35645ac3 7400 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
5106dd6e
JA
7401 io_req_set_rsrc_node(req, ctx, 0);
7402out:
93f052cb 7403 io_ring_submit_unlock(ctx, issue_flags);
ac177053
PB
7404 return file;
7405}
d44f554e 7406
d5361233
JA
7407/*
7408 * Drop the file for requeue operations. Only used if req->file is the
7409 * io_uring descriptor itself.
7410 */
7411static void io_drop_inflight_file(struct io_kiocb *req)
7412{
7413 if (unlikely(req->flags & REQ_F_INFLIGHT)) {
7414 fput(req->file);
7415 req->file = NULL;
7416 req->flags &= ~REQ_F_INFLIGHT;
7417 }
7418}
7419
5106dd6e 7420static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
ac177053 7421{
62906e89 7422 struct file *file = fget(fd);
ac177053 7423
cef216fc 7424 trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
09bb8394 7425
ac177053 7426 /* we don't allow fixed io_uring files */
d5361233
JA
7427 if (file && file->f_op == &io_uring_fops)
7428 req->flags |= REQ_F_INFLIGHT;
8371adf5 7429 return file;
09bb8394
JA
7430}
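
/*
 * Userspace counterpart of io_file_get_fixed(): a sketch (liburing helper
 * names assumed) that registers one file and then addresses it by table
 * index with IOSQE_FIXED_FILE rather than by a normal descriptor.
 */
#include <liburing.h>

static int queue_fixed_read(struct io_uring *ring, int fd, void *buf, unsigned len)
{
	struct io_uring_sqe *sqe;
	int ret;

	ret = io_uring_register_files(ring, &fd, 1);
	if (ret)
		return ret;

	sqe = io_uring_get_sqe(ring);
	/* the fd argument (0) is a slot in ctx->file_table, not a real descriptor */
	io_uring_prep_read(sqe, 0, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);
	return io_uring_submit(ring);
}
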
7431
f237c30a 7432static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
89b263f6
JA
7433{
7434 struct io_kiocb *prev = req->timeout.prev;
617a8948 7435 int ret = -ENOENT;
89b263f6
JA
7436
7437 if (prev) {
617a8948 7438 if (!(req->task->flags & PF_EXITING))
cef216fc 7439 ret = io_try_cancel_userdata(req, prev->cqe.user_data);
505657bc 7440 io_req_complete_post(req, ret ?: -ETIME, 0);
89b263f6 7441 io_put_req(prev);
89b263f6
JA
7442 } else {
7443 io_req_complete_post(req, -ETIME, 0);
7444 }
7445}
7446
2665abfd 7447static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 7448{
ad8a48ac
JA
7449 struct io_timeout_data *data = container_of(timer,
7450 struct io_timeout_data, timer);
90cd7e42 7451 struct io_kiocb *prev, *req = data->req;
2665abfd 7452 struct io_ring_ctx *ctx = req->ctx;
2665abfd 7453 unsigned long flags;
2665abfd 7454
89b263f6 7455 spin_lock_irqsave(&ctx->timeout_lock, flags);
90cd7e42
PB
7456 prev = req->timeout.head;
7457 req->timeout.head = NULL;
2665abfd
JA
7458
7459 /*
7460 * We don't expect the list to be empty; that will only happen if we
7461 * race with the completion of the linked work.
7462 */
447c19f3 7463 if (prev) {
f2f87370 7464 io_remove_next_linked(prev);
447c19f3
PB
7465 if (!req_ref_inc_not_zero(prev))
7466 prev = NULL;
7467 }
ef9dd637 7468 list_del(&req->timeout.list);
89b263f6
JA
7469 req->timeout.prev = prev;
7470 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
2665abfd 7471
89b263f6 7472 req->io_task_work.func = io_req_task_link_timeout;
4813c377 7473 io_req_task_work_add(req, false);
2665abfd
JA
7474 return HRTIMER_NORESTART;
7475}
7476
de968c18 7477static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 7478{
de968c18
PB
7479 struct io_ring_ctx *ctx = req->ctx;
7480
89b263f6 7481 spin_lock_irq(&ctx->timeout_lock);
76a46e06 7482 /*
f2f87370
PB
7483 * If the back reference is NULL, then our linked request finished
7484 * before we got a chance to set up the timer.
76a46e06 7485 */
90cd7e42 7486 if (req->timeout.head) {
e8c2bc1f 7487 struct io_timeout_data *data = req->async_data;
94ae5e77 7488
ad8a48ac
JA
7489 data->timer.function = io_link_timeout_fn;
7490 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
7491 data->mode);
ef9dd637 7492 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
2665abfd 7493 }
89b263f6 7494 spin_unlock_irq(&ctx->timeout_lock);
2665abfd 7495 /* drop submission reference */
76a46e06
JA
7496 io_put_req(req);
7497}
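
/*
 * Userspace shape of the linked-timeout machinery above: a liburing sketch
 * (helper names assumed) that links a read with IOSQE_IO_LINK and follows
 * it with IORING_OP_LINK_TIMEOUT, so the read is cancelled if it does not
 * complete before 'ts' expires.
 */
#include <liburing.h>

static void queue_read_with_deadline(struct io_uring *ring, int fd,
				     void *buf, unsigned len,
				     struct __kernel_timespec *ts)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);

	sqe = io_uring_get_sqe(ring);
	/* must immediately follow the linked request, as enforced in io_timeout_prep() */
	io_uring_prep_link_timeout(sqe, ts, 0);
}
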
2665abfd 7498
7bfa9bad 7499static void io_queue_async(struct io_kiocb *req, int ret)
d475a9a6
PB
7500 __must_hold(&req->ctx->uring_lock)
7501{
7bfa9bad
PB
7502 struct io_kiocb *linked_timeout;
7503
7504 if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
7505 io_req_complete_failed(req, ret);
7506 return;
7507 }
7508
7509 linked_timeout = io_prep_linked_timeout(req);
d475a9a6 7510
4d9237e3 7511 switch (io_arm_poll_handler(req, 0)) {
d475a9a6 7512 case IO_APOLL_READY:
d475a9a6
PB
7513 io_req_task_queue(req);
7514 break;
7515 case IO_APOLL_ABORTED:
7516 /*
7517 * Queued up for async execution; the worker will release
7518 * the submit reference when the iocb is actually submitted.
7519 */
77955efb 7520 io_queue_iowq(req, NULL);
d475a9a6 7521 break;
b1c62645 7522 case IO_APOLL_OK:
b1c62645 7523 break;
d475a9a6
PB
7524 }
7525
7526 if (linked_timeout)
7527 io_queue_linked_timeout(linked_timeout);
7528}
7529
cbc2e203 7530static inline void io_queue_sqe(struct io_kiocb *req)
282cdc86 7531 __must_hold(&req->ctx->uring_lock)
2b188cc1 7532{
e0c5c576 7533 int ret;
2b188cc1 7534
c5eef2b9 7535 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 7536
fff4e40e
PB
7537 if (req->flags & REQ_F_COMPLETE_INLINE) {
7538 io_req_add_compl_list(req);
d9f9d284 7539 return;
fff4e40e 7540 }
491381ce
JA
7541 /*
7542 * We async punt it if the file wasn't marked NOWAIT, or if the file
7543 * doesn't support non-blocking read/write attempts
7544 */
7bfa9bad 7545 if (likely(!ret))
cb2d344c 7546 io_arm_ltimeout(req);
7bfa9bad
PB
7547 else
7548 io_queue_async(req, ret);
2b188cc1
JA
7549}
7550
4652fe3f 7551static void io_queue_sqe_fallback(struct io_kiocb *req)
282cdc86 7552 __must_hold(&req->ctx->uring_lock)
4fe2c963 7553{
17b147f6
PB
7554 if (unlikely(req->flags & REQ_F_FAIL)) {
7555 /*
7556 * We don't submit, fail them all, for that replace hardlinks
7557 * with normal links. Extra REQ_F_LINK is tolerated.
7558 */
7559 req->flags &= ~REQ_F_HARDLINK;
7560 req->flags |= REQ_F_LINK;
7561 io_req_complete_failed(req, req->cqe.res);
e0eb71dc
PB
7562 } else if (unlikely(req->ctx->drain_active)) {
7563 io_drain_req(req);
76cc33d7
PB
7564 } else {
7565 int ret = io_req_prep_async(req);
7566
7567 if (unlikely(ret))
7568 io_req_complete_failed(req, ret);
7569 else
77955efb 7570 io_queue_iowq(req, NULL);
ce35a47a 7571 }
4fe2c963
JL
7572}
7573
b16fed66
PB
7574/*
7575 * Check SQE restrictions (opcode and flags).
7576 *
7577 * Returns 'true' if SQE is allowed, 'false' otherwise.
7578 */
7579static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7580 struct io_kiocb *req,
7581 unsigned int sqe_flags)
4fe2c963 7582{
b16fed66
PB
7583 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7584 return false;
7585
7586 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7587 ctx->restrictions.sqe_flags_required)
7588 return false;
7589
7590 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7591 ctx->restrictions.sqe_flags_required))
7592 return false;
7593
7594 return true;
4fe2c963
JL
7595}
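
/*
 * Sketch of how the ctx->restrictions checked above are populated from
 * userspace (liburing wrappers assumed): the ring is created disabled, an
 * opcode allow-list is registered, then the ring is enabled.
 */
#include <liburing.h>

static int setup_restricted_ring(struct io_uring *ring)
{
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITEV },
	};
	int ret;

	ret = io_uring_queue_init(8, ring, IORING_SETUP_R_DISABLED);
	if (ret)
		return ret;
	ret = io_uring_register_restrictions(ring, res, 2);
	if (ret)
		return ret;
	/* SQEs outside the allow-list now fail io_check_restriction(), i.e. -EACCES */
	return io_uring_enable_rings(ring);
}
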
7596
22b2ca31
PB
7597static void io_init_req_drain(struct io_kiocb *req)
7598{
7599 struct io_ring_ctx *ctx = req->ctx;
7600 struct io_kiocb *head = ctx->submit_state.link.head;
7601
7602 ctx->drain_active = true;
7603 if (head) {
7604 /*
7605 * If we need to drain a request in the middle of a link, drain
7606 * the head request and the next request/link after the current
7607 * link. Considering sequential execution of links,
b6c7db32 7608 * REQ_F_IO_DRAIN will be maintained for every request of our
22b2ca31
PB
7609 * link.
7610 */
b6c7db32 7611 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
22b2ca31
PB
7612 ctx->drain_next = true;
7613 }
7614}
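
/*
 * Userspace trigger for the drain handling above: a liburing sketch that
 * marks one SQE with IOSQE_IO_DRAIN so it only starts once every
 * previously submitted request has completed.
 */
#include <liburing.h>

static void queue_drained_fsync(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_fsync(sqe, fd, 0);
	/* handled via io_init_req() -> io_init_req_drain() on the kernel side */
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
}
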
7615
b16fed66
PB
7616static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7617 const struct io_uring_sqe *sqe)
282cdc86 7618 __must_hold(&ctx->uring_lock)
b16fed66 7619{
b16fed66 7620 unsigned int sqe_flags;
fc0ae024 7621 int personality;
4a04d1d1 7622 u8 opcode;
b16fed66 7623
864ea921 7624 /* req is partially pre-initialised, see io_preinit_req() */
4a04d1d1 7625 req->opcode = opcode = READ_ONCE(sqe->opcode);
b16fed66
PB
7626 /* same numerical values with corresponding REQ_F_*, safe to copy */
7627 req->flags = sqe_flags = READ_ONCE(sqe->flags);
cef216fc 7628 req->cqe.user_data = READ_ONCE(sqe->user_data);
b16fed66 7629 req->file = NULL;
b16fed66 7630 req->fixed_rsrc_refs = NULL;
b16fed66 7631 req->task = current;
b16fed66 7632
4a04d1d1
PB
7633 if (unlikely(opcode >= IORING_OP_LAST)) {
7634 req->opcode = 0;
b16fed66 7635 return -EINVAL;
4a04d1d1 7636 }
68fe256a
PB
7637 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7638 /* enforce forwards compatibility on users */
7639 if (sqe_flags & ~SQE_VALID_FLAGS)
7640 return -EINVAL;
7641 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
4a04d1d1 7642 !io_op_defs[opcode].buffer_select)
68fe256a 7643 return -EOPNOTSUPP;
5562a8d7
PB
7644 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
7645 ctx->drain_disabled = true;
7646 if (sqe_flags & IOSQE_IO_DRAIN) {
7647 if (ctx->drain_disabled)
7648 return -EOPNOTSUPP;
22b2ca31 7649 io_init_req_drain(req);
5562a8d7 7650 }
2a56a9bd
PB
7651 }
7652 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7653 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7654 return -EACCES;
7655 /* knock it to the slow queue path, will be drained there */
7656 if (ctx->drain_active)
7657 req->flags |= REQ_F_FORCE_ASYNC;
7658 /* if there is no link, we're at "next" request and need to drain */
7659 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7660 ctx->drain_next = false;
7661 ctx->drain_active = true;
b6c7db32 7662 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2a56a9bd 7663 }
68fe256a 7664 }
b16fed66 7665
4a04d1d1 7666 if (io_op_defs[opcode].needs_file) {
6d63416d
PB
7667 struct io_submit_state *state = &ctx->submit_state;
7668
cef216fc 7669 req->cqe.fd = READ_ONCE(sqe->fd);
6bf9c47a 7670
6d63416d
PB
7671 /*
7672 * Plug now if we have more than 2 IO left after this, and the
7673 * target is potentially a read/write to block based storage.
7674 */
4a04d1d1 7675 if (state->need_plug && io_op_defs[opcode].plug) {
6d63416d
PB
7676 state->plug_started = true;
7677 state->need_plug = false;
5ca7a8b3 7678 blk_start_plug_nr_ios(&state->plug, state->submit_nr);
6d63416d 7679 }
b16fed66 7680 }
863e0560 7681
003e8dcc
JA
7682 personality = READ_ONCE(sqe->personality);
7683 if (personality) {
cdab10bf
LT
7684 int ret;
7685
c10d1f98
PB
7686 req->creds = xa_load(&ctx->personalities, personality);
7687 if (!req->creds)
003e8dcc 7688 return -EINVAL;
c10d1f98 7689 get_cred(req->creds);
cdc1404a
PM
7690 ret = security_uring_override_creds(req->creds);
7691 if (ret) {
7692 put_cred(req->creds);
7693 return ret;
7694 }
b8e64b53 7695 req->flags |= REQ_F_CREDS;
003e8dcc 7696 }
b16fed66 7697
fc0ae024 7698 return io_req_prep(req, sqe);
b16fed66
PB
7699}
7700
df3becde
PB
7701static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
7702 struct io_kiocb *req, int ret)
7703{
7704 struct io_ring_ctx *ctx = req->ctx;
7705 struct io_submit_link *link = &ctx->submit_state.link;
7706 struct io_kiocb *head = link->head;
7707
7708 trace_io_uring_req_failed(sqe, ctx, req, ret);
7709
7710 /*
7711 * Avoid breaking links in the middle as it renders links with SQPOLL
7712 * unusable. Instead of failing eagerly, continue assembling the link if
7713 * applicable and mark the head with REQ_F_FAIL. The link flushing code
7714 * should find the flag and handle the rest.
7715 */
7716 req_fail_link_node(req, ret);
7717 if (head && !(head->flags & REQ_F_FAIL))
7718 req_fail_link_node(head, -ECANCELED);
7719
7720 if (!(req->flags & IO_REQ_LINK_FLAGS)) {
7721 if (head) {
7722 link->last->link = req;
7723 link->head = NULL;
7724 req = head;
7725 }
7726 io_queue_sqe_fallback(req);
7727 return ret;
7728 }
7729
7730 if (head)
7731 link->last->link = req;
7732 else
7733 link->head = req;
7734 link->last = req;
7735 return 0;
7736}
7737
7738static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 7739 const struct io_uring_sqe *sqe)
282cdc86 7740 __must_hold(&ctx->uring_lock)
9e645e11 7741{
a1ab7b35 7742 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 7743 int ret;
9e645e11 7744
a6b8cadc 7745 ret = io_init_req(ctx, req, sqe);
df3becde
PB
7746 if (unlikely(ret))
7747 return io_submit_fail_init(sqe, req, ret);
441b8a78 7748
be7053b7 7749 /* don't need @sqe from now on */
cef216fc 7750 trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
236daeae
OL
7751 req->flags, true,
7752 ctx->flags & IORING_SETUP_SQPOLL);
a6b8cadc 7753
9e645e11
JA
7754 /*
7755 * If we already have a head request, queue this one for async
7756 * submittal once the head completes. If we don't have a head but
7757 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7758 * submitted sync once the chain is complete. If none of those
7759 * conditions are true (normal request), then just queue it.
7760 */
924a07e4 7761 if (unlikely(link->head)) {
df3becde
PB
7762 ret = io_req_prep_async(req);
7763 if (unlikely(ret))
7764 return io_submit_fail_init(sqe, req, ret);
7765
7766 trace_io_uring_link(ctx, req, link->head);
f2f87370 7767 link->last->link = req;
863e0560 7768 link->last = req;
32fe525b 7769
da1a08c5 7770 if (req->flags & IO_REQ_LINK_FLAGS)
f15a3431 7771 return 0;
df3becde
PB
7772 /* last request of the link, flush it */
7773 req = link->head;
f15a3431 7774 link->head = NULL;
924a07e4
PB
7775 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
7776 goto fallback;
7777
7778 } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
7779 REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
7780 if (req->flags & IO_REQ_LINK_FLAGS) {
7781 link->head = req;
7782 link->last = req;
7783 } else {
7784fallback:
7785 io_queue_sqe_fallback(req);
7786 }
f15a3431 7787 return 0;
9e645e11 7788 }
2e6e1fde 7789
924a07e4 7790 io_queue_sqe(req);
1d4240cc 7791 return 0;
9e645e11
JA
7792}
7793
9a56a232
JA
7794/*
7795 * Batched submission is done, ensure local IO is flushed out.
7796 */
553deffd 7797static void io_submit_state_end(struct io_ring_ctx *ctx)
9a56a232 7798{
553deffd
PB
7799 struct io_submit_state *state = &ctx->submit_state;
7800
e126391c
PB
7801 if (unlikely(state->link.head))
7802 io_queue_sqe_fallback(state->link.head);
553deffd 7803 /* flush only after queuing links as they can generate completions */
c450178d 7804 io_submit_flush_completions(ctx);
27926b68
JA
7805 if (state->plug_started)
7806 blk_finish_plug(&state->plug);
9a56a232
JA
7807}
7808
7809/*
7810 * Start submission side cache.
7811 */
7812static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 7813 unsigned int max_ios)
9a56a232 7814{
27926b68 7815 state->plug_started = false;
4b628aeb 7816 state->need_plug = max_ios > 2;
5ca7a8b3 7817 state->submit_nr = max_ios;
a1ab7b35
PB
7818 /* set only head, no need to init link_last in advance */
7819 state->link.head = NULL;
9a56a232
JA
7820}
7821
2b188cc1
JA
7822static void io_commit_sqring(struct io_ring_ctx *ctx)
7823{
75b28aff 7824 struct io_rings *rings = ctx->rings;
2b188cc1 7825
caf582c6
PB
7826 /*
7827 * Ensure any loads from the SQEs are done at this point,
7828 * since once we write the new head, the application could
7829 * write new data to them.
7830 */
7831 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
7832}
7833
2b188cc1 7834/*
dd9ae8a0 7835 * Fetch an sqe, if one is available. Note this returns a pointer to memory
2b188cc1
JA
7836 * that is mapped by userspace. This means that care needs to be taken to
7837 * ensure that reads are stable, as we cannot rely on userspace always
7838 * being a good citizen. If members of the sqe are validated and then later
7839 * used, it's important that those reads are done through READ_ONCE() to
7840 * prevent a re-load down the line.
7841 */
709b302f 7842static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 7843{
ea5ab3b5 7844 unsigned head, mask = ctx->sq_entries - 1;
17d3aeb3 7845 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2b188cc1
JA
7846
7847 /*
7848 * The cached sq head (or cq tail) serves two purposes:
7849 *
7850 * 1) allows us to batch the cost of updating the user visible
7851 * head updates.
7852 * 2) allows the kernel side to track the head on its own, even
7853 * though the application is the one updating it.
7854 */
17d3aeb3 7855 head = READ_ONCE(ctx->sq_array[sq_idx]);
709b302f
PB
7856 if (likely(head < ctx->sq_entries))
7857 return &ctx->sq_sqes[head];
2b188cc1
JA
7858
7859 /* drop invalid entries */
15641e42
PB
7860 ctx->cq_extra--;
7861 WRITE_ONCE(ctx->rings->sq_dropped,
7862 READ_ONCE(ctx->rings->sq_dropped) + 1);
709b302f
PB
7863 return NULL;
7864}
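
/*
 * Userspace producer counterpart of io_get_sqe(), sketched under the
 * assumption that the SQ ring and SQE array have already been mmap()ed:
 * 'sqes', 'sq_array', 'tail' and 'sq_mask' stand in for pointers and
 * values derived from struct io_sqring_offsets. The application fills an
 * SQE slot and publishes its index through the sq_array indirection that
 * io_get_sqe() reads back above.
 */
#include <linux/io_uring.h>

static void push_sqe(struct io_uring_sqe *sqes, unsigned *sq_array,
		     unsigned tail, unsigned sq_mask,
		     const struct io_uring_sqe *src)
{
	unsigned idx = tail & sq_mask;

	sqes[idx] = *src;	/* fill the SQE slot itself */
	sq_array[idx] = idx;	/* the index io_get_sqe() READ_ONCE()s from ctx->sq_array */
	/* the caller then publishes 'tail + 1' to the shared SQ tail */
}
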
7865
0f212204 7866static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
282cdc86 7867 __must_hold(&ctx->uring_lock)
6c271ce2 7868{
69629809 7869 unsigned int entries = io_sqring_entries(ctx);
8e6971a8
PB
7870 unsigned int left;
7871 int ret;
6c271ce2 7872
51d48dab 7873 if (unlikely(!entries))
69629809 7874 return 0;
ee7d46d9 7875 /* make sure SQ entry isn't read before tail */
8e6971a8
PB
7876 ret = left = min3(nr, ctx->sq_entries, entries);
7877 io_get_task_refs(left);
7878 io_submit_state_start(&ctx->submit_state, left);
6c271ce2 7879
69629809 7880 do {
3529d8c2 7881 const struct io_uring_sqe *sqe;
196be95c 7882 struct io_kiocb *req;
fb5ccc98 7883
8e6971a8 7884 if (unlikely(!io_alloc_req_refill(ctx)))
fb5ccc98 7885 break;
a33ae9ce 7886 req = io_alloc_req(ctx);
4fccfcbb
PB
7887 sqe = io_get_sqe(ctx);
7888 if (unlikely(!sqe)) {
fa05457a 7889 io_req_add_to_cache(req, ctx);
4fccfcbb
PB
7890 break;
7891 }
1cd15904
PB
7892
7893 /*
7894 * Continue submitting even for sqe failure if the
7895 * ring was setup with IORING_SETUP_SUBMIT_ALL
7896 */
7897 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
7898 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
7899 left--;
7900 break;
bcbb7bf6 7901 }
1cd15904 7902 } while (--left);
9466f437 7903
8e6971a8
PB
7904 if (unlikely(left)) {
7905 ret -= left;
7906 /* try again if it submitted nothing and can't allocate a req */
7907 if (!ret && io_req_cache_empty(ctx))
7908 ret = -EAGAIN;
7909 current->io_uring->cached_refs += left;
9466f437 7910 }
6c271ce2 7911
553deffd 7912 io_submit_state_end(ctx);
ae9428ca
PB
7913 /* Commit SQ ring head once we've consumed and submitted all SQEs */
7914 io_commit_sqring(ctx);
8e6971a8 7915 return ret;
6c271ce2
JA
7916}
7917
e4b6d902
PB
7918static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7919{
7920 return READ_ONCE(sqd->state);
7921}
7922
23b3628e
XW
7923static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7924{
7925 /* Tell userspace we may need a wakeup call */
79ebeaee 7926 spin_lock(&ctx->completion_lock);
20c0b380
NA
7927 WRITE_ONCE(ctx->rings->sq_flags,
7928 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
79ebeaee 7929 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7930}
7931
7932static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7933{
79ebeaee 7934 spin_lock(&ctx->completion_lock);
20c0b380
NA
7935 WRITE_ONCE(ctx->rings->sq_flags,
7936 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
79ebeaee 7937 spin_unlock(&ctx->completion_lock);
23b3628e
XW
7938}
7939
08369246 7940static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 7941{
c8d1ba58 7942 unsigned int to_submit;
bdcd3eab 7943 int ret = 0;
6c271ce2 7944
c8d1ba58 7945 to_submit = io_sqring_entries(ctx);
e95eee2d 7946 /* if we're handling multiple rings, cap submit size for fairness */
4ce8ad95
OL
7947 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7948 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
e95eee2d 7949
5eef4e87 7950 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
948e1947
PB
7951 const struct cred *creds = NULL;
7952
7953 if (ctx->sq_creds != current_cred())
7954 creds = override_creds(ctx->sq_creds);
a4c0b3de 7955
c8d1ba58 7956 mutex_lock(&ctx->uring_lock);
5eef4e87 7957 if (!wq_list_empty(&ctx->iopoll_list))
5ba3c874 7958 io_do_iopoll(ctx, true);
906a3c6f 7959
3b763ba1
PB
7960 /*
7961 * Don't submit if refs are dying, good for io_uring_register(),
7962 * but it is also relied upon by io_ring_exit_work()
7963 */
0298ef96
PB
7964 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7965 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 7966 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58 7967 mutex_unlock(&ctx->uring_lock);
cb318216 7968
acfb381d
PB
7969 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7970 wake_up(&ctx->sqo_sq_wait);
948e1947
PB
7971 if (creds)
7972 revert_creds(creds);
acfb381d 7973 }
6c271ce2 7974
08369246
XW
7975 return ret;
7976}
6c271ce2 7977
c072481d 7978static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
08369246
XW
7979{
7980 struct io_ring_ctx *ctx;
7981 unsigned sq_thread_idle = 0;
6c271ce2 7982
c9dca27d
PB
7983 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7984 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
08369246 7985 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 7986}
6c271ce2 7987
e4b6d902
PB
7988static bool io_sqd_handle_event(struct io_sq_data *sqd)
7989{
7990 bool did_sig = false;
7991 struct ksignal ksig;
7992
7993 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7994 signal_pending(current)) {
7995 mutex_unlock(&sqd->lock);
7996 if (signal_pending(current))
7997 did_sig = get_signal(&ksig);
7998 cond_resched();
7999 mutex_lock(&sqd->lock);
8000 }
e4b6d902
PB
8001 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8002}
8003
c8d1ba58
JA
8004static int io_sq_thread(void *data)
8005{
69fb2131
JA
8006 struct io_sq_data *sqd = data;
8007 struct io_ring_ctx *ctx;
a0d9205f 8008 unsigned long timeout = 0;
37d1e2e3 8009 char buf[TASK_COMM_LEN];
08369246 8010 DEFINE_WAIT(wait);
6c271ce2 8011
696ee88a 8012 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
37d1e2e3 8013 set_task_comm(current, buf);
37d1e2e3
JA
8014
8015 if (sqd->sq_cpu != -1)
8016 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
8017 else
8018 set_cpus_allowed_ptr(current, cpu_online_mask);
8019 current->flags |= PF_NO_SETAFFINITY;
8020
5bd2182d
PM
8021 audit_alloc_kernel(current);
8022
09a6f4ef 8023 mutex_lock(&sqd->lock);
e4b6d902 8024 while (1) {
1a924a80 8025 bool cap_entries, sqt_spin = false;
c1edbf5f 8026
e4b6d902
PB
8027 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
8028 if (io_sqd_handle_event(sqd))
c7d95613 8029 break;
08369246
XW
8030 timeout = jiffies + sqd->sq_thread_idle;
8031 }
e4b6d902 8032
e95eee2d 8033 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 8034 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
948e1947 8035 int ret = __io_sq_thread(ctx, cap_entries);
7c30f36a 8036
5eef4e87 8037 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
08369246 8038 sqt_spin = true;
69fb2131 8039 }
dd432ea5
PB
8040 if (io_run_task_work())
8041 sqt_spin = true;
6c271ce2 8042
08369246 8043 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 8044 cond_resched();
08369246
XW
8045 if (sqt_spin)
8046 timeout = jiffies + sqd->sq_thread_idle;
8047 continue;
8048 }
8049
08369246 8050 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7f62d40d 8051 if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
1a924a80
PB
8052 bool needs_sched = true;
8053
724cb4f9 8054 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
aaa9f0f4
PB
8055 io_ring_set_wakeup_flag(ctx);
8056
724cb4f9 8057 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5eef4e87 8058 !wq_list_empty(&ctx->iopoll_list)) {
724cb4f9
HX
8059 needs_sched = false;
8060 break;
8061 }
649bb75d
AK
8062
8063 /*
8064 * Ensure the store of the wakeup flag is not
8065 * reordered with the load of the SQ tail
8066 */
8067 smp_mb();
8068
724cb4f9
HX
8069 if (io_sqring_entries(ctx)) {
8070 needs_sched = false;
8071 break;
8072 }
8073 }
8074
8075 if (needs_sched) {
8076 mutex_unlock(&sqd->lock);
8077 schedule();
8078 mutex_lock(&sqd->lock);
8079 }
69fb2131
JA
8080 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8081 io_ring_clear_wakeup_flag(ctx);
6c271ce2 8082 }
08369246
XW
8083
8084 finish_wait(&sqd->wait, &wait);
8085 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 8086 }
28cea78a 8087
78cc687b 8088 io_uring_cancel_generic(true, sqd);
37d1e2e3 8089 sqd->thread = NULL;
05962f95 8090 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
5f3f26f9 8091 io_ring_set_wakeup_flag(ctx);
521d6a73 8092 io_run_task_work();
734551df
PB
8093 mutex_unlock(&sqd->lock);
8094
5bd2182d
PM
8095 audit_free(current);
8096
37d1e2e3
JA
8097 complete(&sqd->exited);
8098 do_exit(0);
6c271ce2
JA
8099}
8100
bda52162
JA
8101struct io_wait_queue {
8102 struct wait_queue_entry wq;
8103 struct io_ring_ctx *ctx;
5fd46178 8104 unsigned cq_tail;
bda52162
JA
8105 unsigned nr_timeouts;
8106};
8107
6c503150 8108static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
8109{
8110 struct io_ring_ctx *ctx = iowq->ctx;
5fd46178 8111 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
bda52162
JA
8112
8113 /*
d195a66e 8114 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
8115 * started waiting. For timeouts, we always want to return to userspace,
8116 * regardless of event count.
8117 */
5fd46178 8118 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
bda52162
JA
8119}
8120
8121static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
8122 int wake_flags, void *key)
8123{
8124 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
8125 wq);
8126
6c503150
PB
8127 /*
8128 * Cannot safely flush overflowed CQEs from here, ensure we wake up
8129 * the task, and the next invocation will do it.
8130 */
5ed7a37d 8131 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
6c503150
PB
8132 return autoremove_wake_function(curr, mode, wake_flags, key);
8133 return -1;
bda52162
JA
8134}
8135
af9c1a44
JA
8136static int io_run_task_work_sig(void)
8137{
8138 if (io_run_task_work())
8139 return 1;
0b8cfa97 8140 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
792ee0f6 8141 return -ERESTARTSYS;
c5020bc8
OL
8142 if (task_sigpending(current))
8143 return -EINTR;
8144 return 0;
af9c1a44
JA
8145}
8146
eeb60b9a
PB
8147/* when this returns >0, the caller should retry */
8148static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
8149 struct io_wait_queue *iowq,
22833966 8150 ktime_t timeout)
eeb60b9a
PB
8151{
8152 int ret;
8153
8154 /* make sure we run task_work before checking for signals */
8155 ret = io_run_task_work_sig();
8156 if (ret || io_should_wake(iowq))
8157 return ret;
8158 /* let the caller flush overflows, retry */
5ed7a37d 8159 if (test_bit(0, &ctx->check_cq_overflow))
eeb60b9a
PB
8160 return 1;
8161
22833966
JA
8162 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
8163 return -ETIME;
8164 return 1;
eeb60b9a
PB
8165}
8166
2b188cc1
JA
8167/*
8168 * Wait until events become available, if we don't already have some. The
8169 * application must reap them itself, as they reside on the shared cq ring.
8170 */
8171static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
8172 const sigset_t __user *sig, size_t sigsz,
8173 struct __kernel_timespec __user *uts)
2b188cc1 8174{
90291099 8175 struct io_wait_queue iowq;
75b28aff 8176 struct io_rings *rings = ctx->rings;
22833966 8177 ktime_t timeout = KTIME_MAX;
c1d5a224 8178 int ret;
2b188cc1 8179
b41e9852 8180 do {
90f67366 8181 io_cqring_overflow_flush(ctx);
6c503150 8182 if (io_cqring_events(ctx) >= min_events)
b41e9852 8183 return 0;
4c6e277c 8184 if (!io_run_task_work())
b41e9852 8185 break;
b41e9852 8186 } while (1);
2b188cc1
JA
8187
8188 if (sig) {
9e75ad5d
AB
8189#ifdef CONFIG_COMPAT
8190 if (in_compat_syscall())
8191 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 8192 sigsz);
9e75ad5d
AB
8193 else
8194#endif
b772434b 8195 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 8196
2b188cc1
JA
8197 if (ret)
8198 return ret;
8199 }
8200
950e79dd
OL
8201 if (uts) {
8202 struct timespec64 ts;
8203
8204 if (get_timespec64(&ts, uts))
8205 return -EFAULT;
8206 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
8207 }
8208
90291099
PB
8209 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
8210 iowq.wq.private = current;
8211 INIT_LIST_HEAD(&iowq.wq.entry);
8212 iowq.ctx = ctx;
bda52162 8213 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
5fd46178 8214 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
90291099 8215
c826bd7a 8216 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 8217 do {
ca0a2651 8218 /* if we can't even flush overflow, don't wait for more */
90f67366 8219 if (!io_cqring_overflow_flush(ctx)) {
ca0a2651
JA
8220 ret = -EBUSY;
8221 break;
8222 }
311997b3 8223 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
bda52162 8224 TASK_INTERRUPTIBLE);
22833966 8225 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
ca0a2651 8226 cond_resched();
eeb60b9a 8227 } while (ret > 0);
bda52162 8228
b4f20bb4 8229 finish_wait(&ctx->cq_wait, &iowq.wq);
b7db41c9 8230 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 8231
75b28aff 8232 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
8233}
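
/*
 * Userspace view of io_cqring_wait(): a liburing sketch (helper names
 * assumed) that waits for a batch of completions with a timeout; this
 * reaches the function above via io_uring_enter() with
 * IORING_ENTER_GETEVENTS.
 */
#include <liburing.h>

static int reap_batch(struct io_uring *ring, unsigned want)
{
	struct io_uring_cqe *cqe;
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	int ret;

	/* roughly: min_events == want, uts == &ts, sig == NULL on the kernel side */
	ret = io_uring_wait_cqes(ring, &cqe, want, &ts, NULL);
	if (ret == -ETIME)
		return 0;	/* timed out, cf. schedule_hrtimeout() returning -ETIME above */
	return ret;
}
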
8234
9123c8ff 8235static void io_free_page_table(void **table, size_t size)
05f3fb3c 8236{
9123c8ff 8237 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
05f3fb3c 8238
846a4ef2 8239 for (i = 0; i < nr_tables; i++)
9123c8ff
PB
8240 kfree(table[i]);
8241 kfree(table);
8242}
8243
c072481d 8244static __cold void **io_alloc_page_table(size_t size)
9123c8ff
PB
8245{
8246 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
8247 size_t init_size = size;
8248 void **table;
8249
0bea96f5 8250 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
9123c8ff
PB
8251 if (!table)
8252 return NULL;
8253
8254 for (i = 0; i < nr_tables; i++) {
27f6b318 8255 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
9123c8ff 8256
0bea96f5 8257 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
9123c8ff
PB
8258 if (!table[i]) {
8259 io_free_page_table(table, init_size);
8260 return NULL;
8261 }
8262 size -= this_size;
8263 }
8264 return table;
05f3fb3c
JA
8265}
8266
28a9fe25 8267static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
1642b445 8268{
28a9fe25
PB
8269 percpu_ref_exit(&ref_node->refs);
8270 kfree(ref_node);
1642b445
PB
8271}
8272
c072481d 8273static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
b9bd2bea
PB
8274{
8275 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
8276 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
8277 unsigned long flags;
8278 bool first_add = false;
b36a2050 8279 unsigned long delay = HZ;
b9bd2bea
PB
8280
8281 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
8282 node->done = true;
8283
b36a2050
DY
8284 /* if we are mid-quiesce then do not delay */
8285 if (node->rsrc_data->quiesce)
8286 delay = 0;
8287
b9bd2bea
PB
8288 while (!list_empty(&ctx->rsrc_ref_list)) {
8289 node = list_first_entry(&ctx->rsrc_ref_list,
8290 struct io_rsrc_node, node);
8291 /* recycle ref nodes in order */
8292 if (!node->done)
8293 break;
8294 list_del(&node->node);
8295 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
8296 }
8297 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
8298
8299 if (first_add)
b36a2050 8300 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
b9bd2bea
PB
8301}
8302
f6133fbd 8303static struct io_rsrc_node *io_rsrc_node_alloc(void)
b9bd2bea
PB
8304{
8305 struct io_rsrc_node *ref_node;
8306
8307 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
8308 if (!ref_node)
8309 return NULL;
8310
8311 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
8312 0, GFP_KERNEL)) {
8313 kfree(ref_node);
8314 return NULL;
8315 }
8316 INIT_LIST_HEAD(&ref_node->node);
8317 INIT_LIST_HEAD(&ref_node->rsrc_list);
8318 ref_node->done = false;
8319 return ref_node;
8320}
8321
a7f0ed5a
PB
8322static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
8323 struct io_rsrc_data *data_to_kill)
ab409402 8324 __must_hold(&ctx->uring_lock)
6b06314c 8325{
a7f0ed5a
PB
8326 WARN_ON_ONCE(!ctx->rsrc_backup_node);
8327 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
6b06314c 8328
ab409402
PB
8329 io_rsrc_refs_drop(ctx);
8330
a7f0ed5a
PB
8331 if (data_to_kill) {
8332 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
82fbcfa9 8333
a7f0ed5a 8334 rsrc_node->rsrc_data = data_to_kill;
4956b9ea 8335 spin_lock_irq(&ctx->rsrc_ref_lock);
a7f0ed5a 8336 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
4956b9ea 8337 spin_unlock_irq(&ctx->rsrc_ref_lock);
82fbcfa9 8338
3e942498 8339 atomic_inc(&data_to_kill->refs);
a7f0ed5a
PB
8340 percpu_ref_kill(&rsrc_node->refs);
8341 ctx->rsrc_node = NULL;
8342 }
6b06314c 8343
a7f0ed5a
PB
8344 if (!ctx->rsrc_node) {
8345 ctx->rsrc_node = ctx->rsrc_backup_node;
8346 ctx->rsrc_backup_node = NULL;
8347 }
8bad28d8
HX
8348}
8349
a7f0ed5a 8350static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8dd03afe
PB
8351{
8352 if (ctx->rsrc_backup_node)
8353 return 0;
f6133fbd 8354 ctx->rsrc_backup_node = io_rsrc_node_alloc();
8dd03afe 8355 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8bad28d8
HX
8356}
8357
c072481d
PB
8358static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
8359 struct io_ring_ctx *ctx)
8bad28d8
HX
8360{
8361 int ret;
05589553 8362
215c3902 8363 /* As we may drop ->uring_lock, another task may have started quiesce */
8bad28d8
HX
8364 if (data->quiesce)
8365 return -ENXIO;
05589553 8366
8bad28d8 8367 data->quiesce = true;
1ffc5422 8368 do {
a7f0ed5a 8369 ret = io_rsrc_node_switch_start(ctx);
8dd03afe 8370 if (ret)
f2303b1f 8371 break;
a7f0ed5a 8372 io_rsrc_node_switch(ctx, data);
f2303b1f 8373
3e942498
PB
8374 /* kill initial ref, already quiesced if zero */
8375 if (atomic_dec_and_test(&data->refs))
8376 break;
c018db4a 8377 mutex_unlock(&ctx->uring_lock);
8bad28d8 8378 flush_delayed_work(&ctx->rsrc_put_work);
1ffc5422 8379 ret = wait_for_completion_interruptible(&data->done);
c018db4a
JA
8380 if (!ret) {
8381 mutex_lock(&ctx->uring_lock);
80912cef
DY
8382 if (atomic_read(&data->refs) > 0) {
8383 /*
8384 * it has been revived by another thread while
8385 * we were unlocked
8386 */
8387 mutex_unlock(&ctx->uring_lock);
8388 } else {
8389 break;
8390 }
c018db4a 8391 }
8bad28d8 8392
3e942498
PB
8393 atomic_inc(&data->refs);
8394	/* wait for all work items potentially completing data->done */
8395 flush_delayed_work(&ctx->rsrc_put_work);
cb5e1b81 8396 reinit_completion(&data->done);
8dd03afe 8397
1ffc5422 8398 ret = io_run_task_work_sig();
8bad28d8 8399 mutex_lock(&ctx->uring_lock);
f2303b1f 8400 } while (ret >= 0);
8bad28d8 8401 data->quiesce = false;
05f3fb3c 8402
8bad28d8 8403 return ret;
d7954b2b
BM
8404}
8405
2d091d62
PB
8406static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
8407{
8408 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
8409 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
8410
8411 return &data->tags[table_idx][off];
8412}
8413
44b31f2f 8414static void io_rsrc_data_free(struct io_rsrc_data *data)
1ad555c6 8415{
2d091d62
PB
8416 size_t size = data->nr * sizeof(data->tags[0][0]);
8417
8418 if (data->tags)
8419 io_free_page_table((void **)data->tags, size);
44b31f2f
PB
8420 kfree(data);
8421}
8422
c072481d
PB
8423static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
8424 u64 __user *utags, unsigned nr,
8425 struct io_rsrc_data **pdata)
1ad555c6 8426{
b895c9a6 8427 struct io_rsrc_data *data;
2d091d62 8428 int ret = -ENOMEM;
d878c816 8429 unsigned i;
1ad555c6
BM
8430
8431 data = kzalloc(sizeof(*data), GFP_KERNEL);
8432 if (!data)
d878c816 8433 return -ENOMEM;
2d091d62 8434 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
b60c8dce 8435 if (!data->tags) {
1ad555c6 8436 kfree(data);
d878c816
PB
8437 return -ENOMEM;
8438 }
2d091d62
PB
8439
8440 data->nr = nr;
8441 data->ctx = ctx;
8442 data->do_put = do_put;
d878c816 8443 if (utags) {
2d091d62 8444 ret = -EFAULT;
d878c816 8445 for (i = 0; i < nr; i++) {
fdd1dc31
CIK
8446 u64 *tag_slot = io_get_tag_slot(data, i);
8447
8448 if (copy_from_user(tag_slot, &utags[i],
8449 sizeof(*tag_slot)))
2d091d62 8450 goto fail;
d878c816 8451 }
1ad555c6 8452 }
b60c8dce 8453
3e942498 8454 atomic_set(&data->refs, 1);
1ad555c6 8455 init_completion(&data->done);
d878c816
PB
8456 *pdata = data;
8457 return 0;
2d091d62
PB
8458fail:
8459 io_rsrc_data_free(data);
8460 return ret;
1ad555c6
BM
8461}
8462
9123c8ff
PB
8463static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
8464{
0bea96f5
PB
8465 table->files = kvcalloc(nr_files, sizeof(table->files[0]),
8466 GFP_KERNEL_ACCOUNT);
9123c8ff
PB
8467 return !!table->files;
8468}
8469
042b0d85 8470static void io_free_file_tables(struct io_file_table *table)
9123c8ff 8471{
042b0d85 8472 kvfree(table->files);
9123c8ff
PB
8473 table->files = NULL;
8474}
8475
fff4db76 8476static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1ad555c6 8477{
1f59bc0f
PB
8478 int i;
8479
8480 for (i = 0; i < ctx->nr_user_files; i++) {
8481 struct file *file = io_file_from_index(ctx, i);
8482
8483 if (!file || io_file_need_scm(file))
8484 continue;
8485 io_fixed_file_slot(&ctx->file_table, i)->file_ptr = 0;
8486 fput(file);
8487 }
8488
fff4db76
PB
8489#if defined(CONFIG_UNIX)
8490 if (ctx->ring_sock) {
8491 struct sock *sock = ctx->ring_sock->sk;
8492 struct sk_buff *skb;
8493
8494 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
8495 kfree_skb(skb);
8496 }
fff4db76 8497#endif
042b0d85 8498 io_free_file_tables(&ctx->file_table);
44b31f2f 8499 io_rsrc_data_free(ctx->file_data);
fff4db76
PB
8500 ctx->file_data = NULL;
8501 ctx->nr_user_files = 0;
1ad555c6
BM
8502}
8503
d7954b2b
BM
8504static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
8505{
d7954b2b
BM
8506 int ret;
8507
08480400 8508 if (!ctx->file_data)
d7954b2b 8509 return -ENXIO;
08480400
PB
8510 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
8511 if (!ret)
8512 __io_sqe_files_unregister(ctx);
8513 return ret;
6b06314c
JA
8514}
8515
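/*
 * Parking the SQPOLL thread: io_sq_thread_park() requests a park and takes
 * sqd->lock; io_sq_thread_unpark() releases the lock and clears the park
 * request unless other parkers are still pending (tracked via park_pending).
 */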
37d1e2e3 8516static void io_sq_thread_unpark(struct io_sq_data *sqd)
09a6f4ef 8517 __releases(&sqd->lock)
37d1e2e3 8518{
521d6a73
PB
8519 WARN_ON_ONCE(sqd->thread == current);
8520
9e138a48
PB
8521 /*
8522	 * Do the dance, but don't use a conditional clear_bit(), because it'd
8523	 * race with other threads incrementing park_pending and setting the bit.
8524 */
37d1e2e3 8525 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
9e138a48
PB
8526 if (atomic_dec_return(&sqd->park_pending))
8527 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 8528 mutex_unlock(&sqd->lock);
37d1e2e3
JA
8529}
8530
86e0d676 8531static void io_sq_thread_park(struct io_sq_data *sqd)
09a6f4ef 8532 __acquires(&sqd->lock)
37d1e2e3 8533{
521d6a73
PB
8534 WARN_ON_ONCE(sqd->thread == current);
8535
9e138a48 8536 atomic_inc(&sqd->park_pending);
86e0d676 8537 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 8538 mutex_lock(&sqd->lock);
05962f95 8539 if (sqd->thread)
86e0d676 8540 wake_up_process(sqd->thread);
37d1e2e3
JA
8541}
8542
8543static void io_sq_thread_stop(struct io_sq_data *sqd)
8544{
521d6a73 8545 WARN_ON_ONCE(sqd->thread == current);
88885f66 8546 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
521d6a73 8547
05962f95 8548 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
88885f66 8549 mutex_lock(&sqd->lock);
e8f98f24
JA
8550 if (sqd->thread)
8551 wake_up_process(sqd->thread);
09a6f4ef 8552 mutex_unlock(&sqd->lock);
05962f95 8553 wait_for_completion(&sqd->exited);
37d1e2e3
JA
8554}
8555
534ca6d6 8556static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 8557{
534ca6d6 8558 if (refcount_dec_and_test(&sqd->refs)) {
9e138a48
PB
8559 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8560
37d1e2e3
JA
8561 io_sq_thread_stop(sqd);
8562 kfree(sqd);
8563 }
8564}
8565
8566static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8567{
8568 struct io_sq_data *sqd = ctx->sq_data;
8569
8570 if (sqd) {
05962f95 8571 io_sq_thread_park(sqd);
521d6a73 8572 list_del_init(&ctx->sqd_list);
37d1e2e3 8573 io_sqd_update_thread_idle(sqd);
05962f95 8574 io_sq_thread_unpark(sqd);
37d1e2e3
JA
8575
8576 io_put_sq_data(sqd);
8577 ctx->sq_data = NULL;
534ca6d6
JA
8578 }
8579}
8580
aa06165d
JA
8581static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8582{
8583 struct io_ring_ctx *ctx_attach;
8584 struct io_sq_data *sqd;
8585 struct fd f;
8586
8587 f = fdget(p->wq_fd);
8588 if (!f.file)
8589 return ERR_PTR(-ENXIO);
8590 if (f.file->f_op != &io_uring_fops) {
8591 fdput(f);
8592 return ERR_PTR(-EINVAL);
8593 }
8594
8595 ctx_attach = f.file->private_data;
8596 sqd = ctx_attach->sq_data;
8597 if (!sqd) {
8598 fdput(f);
8599 return ERR_PTR(-EINVAL);
8600 }
5c2469e0
JA
8601 if (sqd->task_tgid != current->tgid) {
8602 fdput(f);
8603 return ERR_PTR(-EPERM);
8604 }
aa06165d
JA
8605
8606 refcount_inc(&sqd->refs);
8607 fdput(f);
8608 return sqd;
8609}
8610
26984fbf
PB
8611static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8612 bool *attached)
534ca6d6
JA
8613{
8614 struct io_sq_data *sqd;
8615
26984fbf 8616 *attached = false;
5c2469e0
JA
8617 if (p->flags & IORING_SETUP_ATTACH_WQ) {
8618 sqd = io_attach_sq_data(p);
26984fbf
PB
8619 if (!IS_ERR(sqd)) {
8620 *attached = true;
5c2469e0 8621 return sqd;
26984fbf 8622 }
5c2469e0
JA
8623 /* fall through for EPERM case, setup new sqd/task */
8624 if (PTR_ERR(sqd) != -EPERM)
8625 return sqd;
8626 }
aa06165d 8627
534ca6d6
JA
8628 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8629 if (!sqd)
8630 return ERR_PTR(-ENOMEM);
8631
9e138a48 8632 atomic_set(&sqd->park_pending, 0);
534ca6d6 8633 refcount_set(&sqd->refs, 1);
69fb2131 8634 INIT_LIST_HEAD(&sqd->ctx_list);
09a6f4ef 8635 mutex_init(&sqd->lock);
534ca6d6 8636 init_waitqueue_head(&sqd->wait);
37d1e2e3 8637 init_completion(&sqd->exited);
534ca6d6
JA
8638 return sqd;
8639}
8640
6b06314c
JA
8641/*
8642 * Ensure the UNIX gc is aware of our file set, so we are certain that
8643 * the io_uring can be safely unregistered on process exit, even if we have
1f59bc0f
PB
8644 * loops in the file referencing. We account only files that can hold other
8645 * files because otherwise they can't form a loop and so are not interesting
8646 * for GC.
6b06314c 8647 */
8b3171bd 8648static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
6b06314c 8649{
73b25d3b 8650#if defined(CONFIG_UNIX)
6b06314c 8651 struct sock *sk = ctx->ring_sock->sk;
73b25d3b 8652 struct sk_buff_head *head = &sk->sk_receive_queue;
6b06314c
JA
8653 struct scm_fp_list *fpl;
8654 struct sk_buff *skb;
6b06314c 8655
73b25d3b
PB
8656 if (likely(!io_file_need_scm(file)))
8657 return 0;
8658
8659 /*
8660 * See if we can merge this file into an existing skb SCM_RIGHTS
8661 * file set. If there's no room, fall back to allocating a new skb
8662 * and filling it in.
8663 */
8664 spin_lock_irq(&head->lock);
8665 skb = skb_peek(head);
8666 if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
8667 __skb_unlink(skb, head);
8668 else
8669 skb = NULL;
8670 spin_unlock_irq(&head->lock);
6b06314c 8671
6b06314c 8672 if (!skb) {
73b25d3b
PB
8673 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8674 if (!fpl)
8675 return -ENOMEM;
6b06314c 8676
73b25d3b
PB
8677 skb = alloc_skb(0, GFP_KERNEL);
8678 if (!skb) {
8679 kfree(fpl);
8680 return -ENOMEM;
8681 }
6b06314c 8682
73b25d3b
PB
8683 fpl->user = get_uid(current_user());
8684 fpl->max = SCM_MAX_FD;
8685 fpl->count = 0;
dca58c6a 8686
73b25d3b
PB
8687 UNIXCB(skb).fp = fpl;
8688 skb->sk = sk;
8689 skb->destructor = unix_destruct_scm;
8690 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8691 }
8692
8693 fpl = UNIXCB(skb).fp;
8694 fpl->fp[fpl->count++] = get_file(file);
8695 unix_inflight(fpl->user, file);
8696 skb_queue_head(head, skb);
dca58c6a 8697 fput(file);
73b25d3b 8698#endif
6b06314c
JA
8699 return 0;
8700}
6b06314c 8701
47e90392 8702static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 8703{
50238531 8704 struct file *file = prsrc->file;
05f3fb3c
JA
8705#if defined(CONFIG_UNIX)
8706 struct sock *sock = ctx->ring_sock->sk;
8707 struct sk_buff_head list, *head = &sock->sk_receive_queue;
8708 struct sk_buff *skb;
8709 int i;
8710
1f59bc0f
PB
8711 if (!io_file_need_scm(file)) {
8712 fput(file);
8713 return;
8714 }
8715
05f3fb3c
JA
8716 __skb_queue_head_init(&list);
8717
8718 /*
8719 * Find the skb that holds this file in its SCM_RIGHTS. When found,
8720 * remove this entry and rearrange the file array.
8721 */
8722 skb = skb_dequeue(head);
8723 while (skb) {
8724 struct scm_fp_list *fp;
8725
8726 fp = UNIXCB(skb).fp;
8727 for (i = 0; i < fp->count; i++) {
8728 int left;
8729
8730 if (fp->fp[i] != file)
8731 continue;
8732
8733 unix_notinflight(fp->user, fp->fp[i]);
8734 left = fp->count - 1 - i;
8735 if (left) {
8736 memmove(&fp->fp[i], &fp->fp[i + 1],
8737 left * sizeof(struct file *));
8738 }
8739 fp->count--;
8740 if (!fp->count) {
8741 kfree_skb(skb);
8742 skb = NULL;
8743 } else {
8744 __skb_queue_tail(&list, skb);
8745 }
8746 fput(file);
8747 file = NULL;
8748 break;
8749 }
8750
8751 if (!file)
8752 break;
8753
8754 __skb_queue_tail(&list, skb);
8755
8756 skb = skb_dequeue(head);
8757 }
8758
8759 if (skb_peek(&list)) {
8760 spin_lock_irq(&head->lock);
8761 while ((skb = __skb_dequeue(&list)) != NULL)
8762 __skb_queue_tail(head, skb);
8763 spin_unlock_irq(&head->lock);
8764 }
8765#else
8766 fput(file);
8767#endif
8768}
8769
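/*
 * Drop every resource queued on @ref_node: post a CQE for tagged entries,
 * invoke the type-specific ->do_put() handler, then destroy the node and,
 * if this was the last reference to the rsrc data, complete data->done.
 */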
b895c9a6 8770static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
65e19f54 8771{
b895c9a6 8772 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
269bbe5f
BM
8773 struct io_ring_ctx *ctx = rsrc_data->ctx;
8774 struct io_rsrc_put *prsrc, *tmp;
05589553 8775
269bbe5f
BM
8776 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8777 list_del(&prsrc->list);
b60c8dce
PB
8778
8779 if (prsrc->tag) {
f8929630
PB
8780 if (ctx->flags & IORING_SETUP_IOPOLL)
8781 mutex_lock(&ctx->uring_lock);
b60c8dce 8782
79ebeaee 8783 spin_lock(&ctx->completion_lock);
913a571a 8784 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
b60c8dce 8785 io_commit_cqring(ctx);
79ebeaee 8786 spin_unlock(&ctx->completion_lock);
b60c8dce 8787 io_cqring_ev_posted(ctx);
f8929630
PB
8788
8789 if (ctx->flags & IORING_SETUP_IOPOLL)
8790 mutex_unlock(&ctx->uring_lock);
b60c8dce
PB
8791 }
8792
40ae0ff7 8793 rsrc_data->do_put(ctx, prsrc);
269bbe5f 8794 kfree(prsrc);
65e19f54 8795 }
05589553 8796
28a9fe25 8797 io_rsrc_node_destroy(ref_node);
3e942498
PB
8798 if (atomic_dec_and_test(&rsrc_data->refs))
8799 complete(&rsrc_data->done);
2faf852d 8800}
65e19f54 8801
269bbe5f 8802static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
8803{
8804 struct io_ring_ctx *ctx;
8805 struct llist_node *node;
8806
269bbe5f
BM
8807 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8808 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
8809
8810 while (node) {
b895c9a6 8811 struct io_rsrc_node *ref_node;
4a38aed2
JA
8812 struct llist_node *next = node->next;
8813
b895c9a6 8814 ref_node = llist_entry(node, struct io_rsrc_node, llist);
269bbe5f 8815 __io_rsrc_put_work(ref_node);
4a38aed2
JA
8816 node = next;
8817 }
8818}
8819
6b06314c 8820static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
792e3582 8821 unsigned nr_args, u64 __user *tags)
6b06314c
JA
8822{
8823 __s32 __user *fds = (__s32 __user *) arg;
05f3fb3c 8824 struct file *file;
f3baed39 8825 int fd, ret;
846a4ef2 8826 unsigned i;
6b06314c 8827
05f3fb3c 8828 if (ctx->file_data)
6b06314c
JA
8829 return -EBUSY;
8830 if (!nr_args)
8831 return -EINVAL;
8832 if (nr_args > IORING_MAX_FIXED_FILES)
8833 return -EMFILE;
3a1b8a4e
PB
8834 if (nr_args > rlimit(RLIMIT_NOFILE))
8835 return -EMFILE;
a7f0ed5a 8836 ret = io_rsrc_node_switch_start(ctx);
f3baed39
PB
8837 if (ret)
8838 return ret;
d878c816
PB
8839 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8840 &ctx->file_data);
8841 if (ret)
8842 return ret;
6b06314c 8843
a03a2a20
PB
8844 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
8845 io_rsrc_data_free(ctx->file_data);
8846 ctx->file_data = NULL;
8847 return -ENOMEM;
8848 }
65e19f54 8849
08a45173 8850 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
a03a2a20
PB
8851 struct io_fixed_file *file_slot;
8852
d878c816 8853 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
600cf3f8 8854 ret = -EFAULT;
a03a2a20 8855 goto fail;
600cf3f8 8856 }
08a45173 8857 /* allow sparse sets */
792e3582
PB
8858 if (fd == -1) {
8859 ret = -EINVAL;
2d091d62 8860 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
a03a2a20 8861 goto fail;
08a45173 8862 continue;
792e3582 8863 }
6b06314c 8864
05f3fb3c 8865 file = fget(fd);
6b06314c 8866 ret = -EBADF;
792e3582 8867 if (unlikely(!file))
a03a2a20 8868 goto fail;
05f3fb3c 8869
6b06314c
JA
8870 /*
8871 * Don't allow io_uring instances to be registered. If UNIX
8872 * isn't enabled, then this causes a reference cycle and this
8873 * instance can never get freed. If UNIX is enabled we'll
8874 * handle it just fine, but there's still no point in allowing
8875 * a ring fd as it doesn't support regular read/write anyway.
8876 */
05f3fb3c
JA
8877 if (file->f_op == &io_uring_fops) {
8878 fput(file);
a03a2a20
PB
8879 goto fail;
8880 }
8b3171bd 8881 ret = io_scm_file_account(ctx, file);
a03a2a20 8882 if (ret) {
a03a2a20
PB
8883 fput(file);
8884 goto fail;
6b06314c 8885 }
e390510a
PB
8886 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8887 io_fixed_file_set(file_slot, file);
05589553 8888 }
6b06314c 8889
a7f0ed5a 8890 io_rsrc_node_switch(ctx, NULL);
a03a2a20
PB
8891 return 0;
8892fail:
8893 __io_sqe_files_unregister(ctx);
6b06314c
JA
8894 return ret;
8895}
8896
9c7b0ba8
PB
8897static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8898 struct io_rsrc_node *node, void *rsrc)
8899{
8f0a2480 8900 u64 *tag_slot = io_get_tag_slot(data, idx);
9c7b0ba8
PB
8901 struct io_rsrc_put *prsrc;
8902
8903 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8904 if (!prsrc)
8905 return -ENOMEM;
8906
8f0a2480
PB
8907 prsrc->tag = *tag_slot;
8908 *tag_slot = 0;
9c7b0ba8
PB
8909 prsrc->rsrc = rsrc;
8910 list_add(&prsrc->list, &node->rsrc_list);
8911 return 0;
8912}
8913
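/*
 * Install @file into fixed file slot @slot_index. If the slot is already
 * occupied, the old file is queued for deferred removal via the rsrc node.
 * On failure the reference to @file is dropped here.
 */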
b9445598
PB
8914static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8915 unsigned int issue_flags, u32 slot_index)
8916{
8917 struct io_ring_ctx *ctx = req->ctx;
9c7b0ba8 8918 bool needs_switch = false;
b9445598
PB
8919 struct io_fixed_file *file_slot;
8920 int ret = -EBADF;
8921
f8929630 8922 io_ring_submit_lock(ctx, issue_flags);
b9445598
PB
8923 if (file->f_op == &io_uring_fops)
8924 goto err;
8925 ret = -ENXIO;
8926 if (!ctx->file_data)
8927 goto err;
8928 ret = -EINVAL;
8929 if (slot_index >= ctx->nr_user_files)
8930 goto err;
8931
8932 slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8933 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
9c7b0ba8
PB
8934
8935 if (file_slot->file_ptr) {
8936 struct file *old_file;
8937
8938 ret = io_rsrc_node_switch_start(ctx);
8939 if (ret)
8940 goto err;
8941
8942 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8943 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8944 ctx->rsrc_node, old_file);
8945 if (ret)
8946 goto err;
8947 file_slot->file_ptr = 0;
8948 needs_switch = true;
8949 }
b9445598 8950
8b3171bd 8951 ret = io_scm_file_account(ctx, file);
e390510a
PB
8952 if (!ret) {
8953 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8954 io_fixed_file_set(file_slot, file);
b9445598 8955 }
b9445598 8956err:
9c7b0ba8
PB
8957 if (needs_switch)
8958 io_rsrc_node_switch(ctx, ctx->file_data);
f8929630 8959 io_ring_submit_unlock(ctx, issue_flags);
b9445598
PB
8960 if (ret)
8961 fput(file);
8962 return ret;
8963}
8964
7df778be
PB
8965static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8966{
8967 unsigned int offset = req->close.file_slot - 1;
8968 struct io_ring_ctx *ctx = req->ctx;
8969 struct io_fixed_file *file_slot;
8970 struct file *file;
4cdd158b 8971 int ret;
7df778be 8972
f8929630 8973 io_ring_submit_lock(ctx, issue_flags);
7df778be
PB
8974 ret = -ENXIO;
8975 if (unlikely(!ctx->file_data))
8976 goto out;
8977 ret = -EINVAL;
8978 if (offset >= ctx->nr_user_files)
8979 goto out;
8980 ret = io_rsrc_node_switch_start(ctx);
8981 if (ret)
8982 goto out;
8983
4cdd158b
PB
8984 offset = array_index_nospec(offset, ctx->nr_user_files);
8985 file_slot = io_fixed_file_slot(&ctx->file_table, offset);
7df778be
PB
8986 ret = -EBADF;
8987 if (!file_slot->file_ptr)
8988 goto out;
8989
8990 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8991 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8992 if (ret)
8993 goto out;
8994
8995 file_slot->file_ptr = 0;
8996 io_rsrc_node_switch(ctx, ctx->file_data);
8997 ret = 0;
8998out:
f8929630 8999 io_ring_submit_unlock(ctx, issue_flags);
7df778be
PB
9000 return ret;
9001}
9002
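/*
 * Apply a registered-file update: for each entry, queue the old file (if
 * any) for deferred removal and install the new one; entries set to
 * IORING_REGISTER_FILES_SKIP are left untouched. Returns the number of
 * entries processed, or an error if none were.
 */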
05f3fb3c 9003static int __io_sqe_files_update(struct io_ring_ctx *ctx,
c3bdad02 9004 struct io_uring_rsrc_update2 *up,
05f3fb3c
JA
9005 unsigned nr_args)
9006{
c3bdad02 9007 u64 __user *tags = u64_to_user_ptr(up->tags);
98f0b3b4 9008 __s32 __user *fds = u64_to_user_ptr(up->data);
b895c9a6 9009 struct io_rsrc_data *data = ctx->file_data;
a04b0ac0
PB
9010 struct io_fixed_file *file_slot;
9011 struct file *file;
98f0b3b4
PB
9012 int fd, i, err = 0;
9013 unsigned int done;
05589553 9014 bool needs_switch = false;
c3a31e60 9015
98f0b3b4
PB
9016 if (!ctx->file_data)
9017 return -ENXIO;
9018 if (up->offset + nr_args > ctx->nr_user_files)
c3a31e60
JA
9019 return -EINVAL;
9020
67973b93 9021 for (done = 0; done < nr_args; done++) {
c3bdad02
PB
9022 u64 tag = 0;
9023
9024 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
9025 copy_from_user(&fd, &fds[done], sizeof(fd))) {
c3a31e60
JA
9026 err = -EFAULT;
9027 break;
9028 }
c3bdad02
PB
9029 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
9030 err = -EINVAL;
9031 break;
9032 }
4e0377a1 9033 if (fd == IORING_REGISTER_FILES_SKIP)
9034 continue;
9035
67973b93 9036 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
aeca241b 9037 file_slot = io_fixed_file_slot(&ctx->file_table, i);
ea64ec02 9038
a04b0ac0
PB
9039 if (file_slot->file_ptr) {
9040 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
4cdd158b 9041 err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
a5318d3c
HD
9042 if (err)
9043 break;
a04b0ac0 9044 file_slot->file_ptr = 0;
05589553 9045 needs_switch = true;
c3a31e60
JA
9046 }
9047 if (fd != -1) {
c3a31e60
JA
9048 file = fget(fd);
9049 if (!file) {
9050 err = -EBADF;
9051 break;
9052 }
9053 /*
9054 * Don't allow io_uring instances to be registered. If
9055 * UNIX isn't enabled, then this causes a reference
9056 * cycle and this instance can never get freed. If UNIX
9057 * is enabled we'll handle it just fine, but there's
9058 * still no point in allowing a ring fd as it doesn't
9059 * support regular read/write anyway.
9060 */
9061 if (file->f_op == &io_uring_fops) {
9062 fput(file);
9063 err = -EBADF;
9064 break;
9065 }
8b3171bd 9066 err = io_scm_file_account(ctx, file);
f3bd9dae
YY
9067 if (err) {
9068 fput(file);
c3a31e60 9069 break;
f3bd9dae 9070 }
e390510a
PB
9071 *io_get_tag_slot(data, i) = tag;
9072 io_fixed_file_set(file_slot, file);
c3a31e60 9073 }
05f3fb3c
JA
9074 }
9075
a7f0ed5a
PB
9076 if (needs_switch)
9077 io_rsrc_node_switch(ctx, data);
c3a31e60
JA
9078 return done ? done : err;
9079}
05589553 9080
685fe7fe
JA
9081static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
9082 struct task_struct *task)
24369c2e 9083{
e941894e 9084 struct io_wq_hash *hash;
24369c2e 9085 struct io_wq_data data;
24369c2e 9086 unsigned int concurrency;
24369c2e 9087
362a9e65 9088 mutex_lock(&ctx->uring_lock);
e941894e
JA
9089 hash = ctx->hash_map;
9090 if (!hash) {
9091 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
362a9e65
YY
9092 if (!hash) {
9093 mutex_unlock(&ctx->uring_lock);
e941894e 9094 return ERR_PTR(-ENOMEM);
362a9e65 9095 }
e941894e
JA
9096 refcount_set(&hash->refs, 1);
9097 init_waitqueue_head(&hash->wait);
9098 ctx->hash_map = hash;
24369c2e 9099 }
362a9e65 9100 mutex_unlock(&ctx->uring_lock);
24369c2e 9101
e941894e 9102 data.hash = hash;
685fe7fe 9103 data.task = task;
ebc11b6c 9104 data.free_work = io_wq_free_work;
f5fa38c5 9105 data.do_work = io_wq_submit_work;
24369c2e 9106
d25e3a3d
JA
9107	/* Do QD, or 4 * CPUs, whichever is smaller */
9108 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 9109
5aa75ed5 9110 return io_wq_create(concurrency, &data);
24369c2e
PB
9111}
9112
c072481d
PB
9113static __cold int io_uring_alloc_task_context(struct task_struct *task,
9114 struct io_ring_ctx *ctx)
0f212204
JA
9115{
9116 struct io_uring_task *tctx;
d8a6df10 9117 int ret;
0f212204 9118
09899b19 9119 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
0f212204
JA
9120 if (unlikely(!tctx))
9121 return -ENOMEM;
9122
e7a6c00d
JA
9123 tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
9124 sizeof(struct file *), GFP_KERNEL);
9125 if (unlikely(!tctx->registered_rings)) {
9126 kfree(tctx);
9127 return -ENOMEM;
9128 }
9129
d8a6df10
JA
9130 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
9131 if (unlikely(ret)) {
e7a6c00d 9132 kfree(tctx->registered_rings);
d8a6df10
JA
9133 kfree(tctx);
9134 return ret;
9135 }
9136
685fe7fe 9137 tctx->io_wq = io_init_wq_offload(ctx, task);
5aa75ed5
JA
9138 if (IS_ERR(tctx->io_wq)) {
9139 ret = PTR_ERR(tctx->io_wq);
9140 percpu_counter_destroy(&tctx->inflight);
e7a6c00d 9141 kfree(tctx->registered_rings);
5aa75ed5
JA
9142 kfree(tctx);
9143 return ret;
9144 }
9145
0f212204
JA
9146 xa_init(&tctx->xa);
9147 init_waitqueue_head(&tctx->wait);
fdaf083c 9148 atomic_set(&tctx->in_idle, 0);
0f212204 9149 task->io_uring = tctx;
7cbf1722
JA
9150 spin_lock_init(&tctx->task_lock);
9151 INIT_WQ_LIST(&tctx->task_list);
4813c377 9152 INIT_WQ_LIST(&tctx->prior_task_list);
7cbf1722 9153 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
9154 return 0;
9155}
9156
9157void __io_uring_free(struct task_struct *tsk)
9158{
9159 struct io_uring_task *tctx = tsk->io_uring;
9160
9161 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e 9162 WARN_ON_ONCE(tctx->io_wq);
09899b19 9163 WARN_ON_ONCE(tctx->cached_refs);
ef8eaa4e 9164
e7a6c00d 9165 kfree(tctx->registered_rings);
d8a6df10 9166 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
9167 kfree(tctx);
9168 tsk->io_uring = NULL;
9169}
9170
c072481d
PB
9171static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
9172 struct io_uring_params *p)
2b188cc1
JA
9173{
9174 int ret;
9175
d25e3a3d
JA
9176 /* Retain compatibility with failing for an invalid attach attempt */
9177 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
9178 IORING_SETUP_ATTACH_WQ) {
9179 struct fd f;
9180
9181 f = fdget(p->wq_fd);
9182 if (!f.file)
9183 return -ENXIO;
0cc936f7
JA
9184 if (f.file->f_op != &io_uring_fops) {
9185 fdput(f);
f2a48dd0 9186 return -EINVAL;
0cc936f7
JA
9187 }
9188 fdput(f);
d25e3a3d 9189 }
6c271ce2 9190 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 9191 struct task_struct *tsk;
534ca6d6 9192 struct io_sq_data *sqd;
26984fbf 9193 bool attached;
534ca6d6 9194
cdc1404a
PM
9195 ret = security_uring_sqpoll();
9196 if (ret)
9197 return ret;
9198
26984fbf 9199 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
9200 if (IS_ERR(sqd)) {
9201 ret = PTR_ERR(sqd);
9202 goto err;
9203 }
69fb2131 9204
7c30f36a 9205 ctx->sq_creds = get_current_cred();
534ca6d6 9206 ctx->sq_data = sqd;
917257da
JA
9207 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
9208 if (!ctx->sq_thread_idle)
9209 ctx->sq_thread_idle = HZ;
9210
78d7f6ba 9211 io_sq_thread_park(sqd);
de75a3d3
PB
9212 list_add(&ctx->sqd_list, &sqd->ctx_list);
9213 io_sqd_update_thread_idle(sqd);
26984fbf 9214 /* don't attach to a dying SQPOLL thread, would be racy */
f2a48dd0 9215 ret = (attached && !sqd->thread) ? -ENXIO : 0;
78d7f6ba
PB
9216 io_sq_thread_unpark(sqd);
9217
de75a3d3
PB
9218 if (ret < 0)
9219 goto err;
9220 if (attached)
5aa75ed5 9221 return 0;
aa06165d 9222
6c271ce2 9223 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 9224 int cpu = p->sq_thread_cpu;
6c271ce2 9225
917257da 9226 ret = -EINVAL;
f2a48dd0 9227 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
e8f98f24 9228 goto err_sqpoll;
37d1e2e3 9229 sqd->sq_cpu = cpu;
6c271ce2 9230 } else {
37d1e2e3 9231 sqd->sq_cpu = -1;
6c271ce2 9232 }
37d1e2e3
JA
9233
9234 sqd->task_pid = current->pid;
5c2469e0 9235 sqd->task_tgid = current->tgid;
46fe18b1
JA
9236 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
9237 if (IS_ERR(tsk)) {
9238 ret = PTR_ERR(tsk);
e8f98f24 9239 goto err_sqpoll;
6c271ce2 9240 }
97a73a0f 9241
46fe18b1 9242 sqd->thread = tsk;
97a73a0f 9243 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 9244 wake_up_new_task(tsk);
0f212204
JA
9245 if (ret)
9246 goto err;
6c271ce2
JA
9247 } else if (p->flags & IORING_SETUP_SQ_AFF) {
9248 /* Can't have SQ_AFF without SQPOLL */
9249 ret = -EINVAL;
9250 goto err;
9251 }
9252
2b188cc1 9253 return 0;
f2a48dd0
PB
9254err_sqpoll:
9255 complete(&ctx->sq_data->exited);
2b188cc1 9256err:
37d1e2e3 9257 io_sq_thread_finish(ctx);
2b188cc1
JA
9258 return ret;
9259}
9260
a087e2b5
BM
9261static inline void __io_unaccount_mem(struct user_struct *user,
9262 unsigned long nr_pages)
2b188cc1
JA
9263{
9264 atomic_long_sub(nr_pages, &user->locked_vm);
9265}
9266
a087e2b5
BM
9267static inline int __io_account_mem(struct user_struct *user,
9268 unsigned long nr_pages)
2b188cc1
JA
9269{
9270 unsigned long page_limit, cur_pages, new_pages;
9271
9272 /* Don't allow more pages than we can safely lock */
9273 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
9274
9275 do {
9276 cur_pages = atomic_long_read(&user->locked_vm);
9277 new_pages = cur_pages + nr_pages;
9278 if (new_pages > page_limit)
9279 return -ENOMEM;
9280 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
9281 new_pages) != cur_pages);
9282
9283 return 0;
9284}
9285
26bfa89e 9286static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 9287{
62e398be 9288 if (ctx->user)
a087e2b5 9289 __io_unaccount_mem(ctx->user, nr_pages);
30975825 9290
26bfa89e
JA
9291 if (ctx->mm_account)
9292 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
9293}
9294
26bfa89e 9295static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 9296{
30975825
BM
9297 int ret;
9298
62e398be 9299 if (ctx->user) {
30975825
BM
9300 ret = __io_account_mem(ctx->user, nr_pages);
9301 if (ret)
9302 return ret;
9303 }
9304
26bfa89e
JA
9305 if (ctx->mm_account)
9306 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
9307
9308 return 0;
9309}
9310
2b188cc1
JA
9311static void io_mem_free(void *ptr)
9312{
52e04ef4
MR
9313 struct page *page;
9314
9315 if (!ptr)
9316 return;
2b188cc1 9317
52e04ef4 9318 page = virt_to_head_page(ptr);
2b188cc1
JA
9319 if (put_page_testzero(page))
9320 free_compound_page(page);
9321}
9322
9323static void *io_mem_alloc(size_t size)
9324{
0a3f1e0b 9325 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2b188cc1 9326
0a3f1e0b 9327 return (void *) __get_free_pages(gfp, get_order(size));
2b188cc1
JA
9328}
9329
75b28aff
HV
9330static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
9331 size_t *sq_offset)
9332{
9333 struct io_rings *rings;
9334 size_t off, sq_array_size;
9335
9336 off = struct_size(rings, cqes, cq_entries);
9337 if (off == SIZE_MAX)
9338 return SIZE_MAX;
9339
9340#ifdef CONFIG_SMP
9341 off = ALIGN(off, SMP_CACHE_BYTES);
9342 if (off == 0)
9343 return SIZE_MAX;
9344#endif
9345
b36200f5
DV
9346 if (sq_offset)
9347 *sq_offset = off;
9348
75b28aff
HV
9349 sq_array_size = array_size(sizeof(u32), sq_entries);
9350 if (sq_array_size == SIZE_MAX)
9351 return SIZE_MAX;
9352
9353 if (check_add_overflow(off, sq_array_size, &off))
9354 return SIZE_MAX;
9355
75b28aff
HV
9356 return off;
9357}
9358
41edf1a5 9359static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
7f61a1e9 9360{
41edf1a5 9361 struct io_mapped_ubuf *imu = *slot;
7f61a1e9
PB
9362 unsigned int i;
9363
6224843d
PB
9364 if (imu != ctx->dummy_ubuf) {
9365 for (i = 0; i < imu->nr_bvecs; i++)
9366 unpin_user_page(imu->bvec[i].bv_page);
9367 if (imu->acct_pages)
9368 io_unaccount_mem(ctx, imu->acct_pages);
9369 kvfree(imu);
9370 }
41edf1a5 9371 *slot = NULL;
7f61a1e9
PB
9372}
9373
bd54b6fe 9374static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
edafccee 9375{
634d00df
PB
9376 io_buffer_unmap(ctx, &prsrc->buf);
9377 prsrc->buf = NULL;
bd54b6fe 9378}
edafccee 9379
bd54b6fe
BM
9380static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
9381{
9382 unsigned int i;
edafccee 9383
7f61a1e9
PB
9384 for (i = 0; i < ctx->nr_user_bufs; i++)
9385 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
edafccee 9386 kfree(ctx->user_bufs);
bb6659cc 9387 io_rsrc_data_free(ctx->buf_data);
edafccee 9388 ctx->user_bufs = NULL;
bd54b6fe 9389 ctx->buf_data = NULL;
edafccee 9390 ctx->nr_user_bufs = 0;
bd54b6fe
BM
9391}
9392
0a96bbe4 9393static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee 9394{
bd54b6fe 9395 int ret;
edafccee 9396
bd54b6fe 9397 if (!ctx->buf_data)
edafccee
JA
9398 return -ENXIO;
9399
bd54b6fe
BM
9400 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
9401 if (!ret)
9402 __io_sqe_buffers_unregister(ctx);
9403 return ret;
edafccee
JA
9404}
9405
9406static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
9407 void __user *arg, unsigned index)
9408{
9409 struct iovec __user *src;
9410
9411#ifdef CONFIG_COMPAT
9412 if (ctx->compat) {
9413 struct compat_iovec __user *ciovs;
9414 struct compat_iovec ciov;
9415
9416 ciovs = (struct compat_iovec __user *) arg;
9417 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
9418 return -EFAULT;
9419
d55e5f5b 9420 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
9421 dst->iov_len = ciov.iov_len;
9422 return 0;
9423 }
9424#endif
9425 src = (struct iovec __user *) arg;
9426 if (copy_from_user(dst, &src[index], sizeof(*dst)))
9427 return -EFAULT;
9428 return 0;
9429}
9430
de293938
JA
9431/*
9432 * Not super efficient, but this only happens at registration time. And we do cache
9433 * the last compound head, so generally we'll only do a full search if we don't
9434 * match that one.
9435 *
9436 * We check if the given compound head page has already been accounted, to
9437 * avoid double accounting it. This allows us to account the full size of the
9438 * page, not just the constituent pages of a huge page.
9439 */
9440static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
9441 int nr_pages, struct page *hpage)
9442{
9443 int i, j;
9444
9445 /* check current page array */
9446 for (i = 0; i < nr_pages; i++) {
9447 if (!PageCompound(pages[i]))
9448 continue;
9449 if (compound_head(pages[i]) == hpage)
9450 return true;
9451 }
9452
9453 /* check previously registered pages */
9454 for (i = 0; i < ctx->nr_user_bufs; i++) {
41edf1a5 9455 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
de293938
JA
9456
9457 for (j = 0; j < imu->nr_bvecs; j++) {
9458 if (!PageCompound(imu->bvec[j].bv_page))
9459 continue;
9460 if (compound_head(imu->bvec[j].bv_page) == hpage)
9461 return true;
9462 }
9463 }
9464
9465 return false;
9466}
9467
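/*
 * Figure out how many pages to charge against the memlock limit for this
 * buffer: normal pages count individually, while a compound (huge) page is
 * charged once at its full size unless its head was already accounted.
 */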
9468static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
9469 int nr_pages, struct io_mapped_ubuf *imu,
9470 struct page **last_hpage)
9471{
9472 int i, ret;
9473
216e5835 9474 imu->acct_pages = 0;
de293938
JA
9475 for (i = 0; i < nr_pages; i++) {
9476 if (!PageCompound(pages[i])) {
9477 imu->acct_pages++;
9478 } else {
9479 struct page *hpage;
9480
9481 hpage = compound_head(pages[i]);
9482 if (hpage == *last_hpage)
9483 continue;
9484 *last_hpage = hpage;
9485 if (headpage_already_acct(ctx, pages, i, hpage))
9486 continue;
9487 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9488 }
9489 }
9490
9491 if (!imu->acct_pages)
9492 return 0;
9493
26bfa89e 9494 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
9495 if (ret)
9496 imu->acct_pages = 0;
9497 return ret;
9498}
9499
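/*
 * Pin the user pages backing @iov and build an io_mapped_ubuf describing
 * them. A NULL iov_base maps to the shared dummy_ubuf; file-backed memory
 * other than shmem and hugetlb is rejected with -EOPNOTSUPP.
 */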
0a96bbe4 9500static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
41edf1a5 9501 struct io_mapped_ubuf **pimu,
0a96bbe4 9502 struct page **last_hpage)
edafccee 9503{
41edf1a5 9504 struct io_mapped_ubuf *imu = NULL;
edafccee
JA
9505 struct vm_area_struct **vmas = NULL;
9506 struct page **pages = NULL;
0a96bbe4
BM
9507 unsigned long off, start, end, ubuf;
9508 size_t size;
9509 int ret, pret, nr_pages, i;
9510
6224843d
PB
9511 if (!iov->iov_base) {
9512 *pimu = ctx->dummy_ubuf;
9513 return 0;
9514 }
9515
0a96bbe4
BM
9516 ubuf = (unsigned long) iov->iov_base;
9517 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9518 start = ubuf >> PAGE_SHIFT;
9519 nr_pages = end - start;
9520
41edf1a5 9521 *pimu = NULL;
0a96bbe4
BM
9522 ret = -ENOMEM;
9523
9524 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9525 if (!pages)
9526 goto done;
9527
9528 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9529 GFP_KERNEL);
9530 if (!vmas)
9531 goto done;
edafccee 9532
41edf1a5 9533 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
a2b4198c 9534 if (!imu)
0a96bbe4
BM
9535 goto done;
9536
9537 ret = 0;
9538 mmap_read_lock(current->mm);
9539 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9540 pages, vmas);
9541 if (pret == nr_pages) {
9542 /* don't support file backed memory */
9543 for (i = 0; i < nr_pages; i++) {
9544 struct vm_area_struct *vma = vmas[i];
9545
40dad765
PB
9546 if (vma_is_shmem(vma))
9547 continue;
0a96bbe4
BM
9548 if (vma->vm_file &&
9549 !is_file_hugepages(vma->vm_file)) {
9550 ret = -EOPNOTSUPP;
9551 break;
9552 }
9553 }
9554 } else {
9555 ret = pret < 0 ? pret : -EFAULT;
9556 }
9557 mmap_read_unlock(current->mm);
9558 if (ret) {
9559 /*
9560			 * if we did a partial map, or found file-backed vmas,
9561			 * release any pages we did get
9562 */
9563 if (pret > 0)
9564 unpin_user_pages(pages, pret);
0a96bbe4
BM
9565 goto done;
9566 }
9567
9568 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9569 if (ret) {
9570 unpin_user_pages(pages, pret);
0a96bbe4
BM
9571 goto done;
9572 }
9573
9574 off = ubuf & ~PAGE_MASK;
9575 size = iov->iov_len;
9576 for (i = 0; i < nr_pages; i++) {
9577 size_t vec_len;
9578
9579 vec_len = min_t(size_t, size, PAGE_SIZE - off);
9580 imu->bvec[i].bv_page = pages[i];
9581 imu->bvec[i].bv_len = vec_len;
9582 imu->bvec[i].bv_offset = off;
9583 off = 0;
9584 size -= vec_len;
9585 }
9586 /* store original address for later verification */
9587 imu->ubuf = ubuf;
4751f53d 9588 imu->ubuf_end = ubuf + iov->iov_len;
0a96bbe4 9589 imu->nr_bvecs = nr_pages;
41edf1a5 9590 *pimu = imu;
0a96bbe4
BM
9591 ret = 0;
9592done:
41edf1a5
PB
9593 if (ret)
9594 kvfree(imu);
0a96bbe4
BM
9595 kvfree(pages);
9596 kvfree(vmas);
9597 return ret;
9598}
9599
2b358604 9600static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 9601{
87094465
PB
9602 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9603 return ctx->user_bufs ? 0 : -ENOMEM;
2b358604 9604}
edafccee 9605
2b358604
BM
9606static int io_buffer_validate(struct iovec *iov)
9607{
50e96989
PB
9608 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9609
2b358604
BM
9610 /*
9611 * Don't impose further limits on the size and buffer
9612 * constraints here, we'll -EINVAL later when IO is
9613 * submitted if they are wrong.
9614 */
6224843d
PB
9615 if (!iov->iov_base)
9616 return iov->iov_len ? -EFAULT : 0;
9617 if (!iov->iov_len)
2b358604 9618 return -EFAULT;
edafccee 9619
2b358604
BM
9620 /* arbitrary limit, but we need something */
9621 if (iov->iov_len > SZ_1G)
9622 return -EFAULT;
edafccee 9623
50e96989
PB
9624 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9625 return -EOVERFLOW;
9626
2b358604
BM
9627 return 0;
9628}
edafccee 9629
2b358604 9630static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
634d00df 9631 unsigned int nr_args, u64 __user *tags)
2b358604 9632{
bd54b6fe
BM
9633 struct page *last_hpage = NULL;
9634 struct io_rsrc_data *data;
2b358604
BM
9635 int i, ret;
9636 struct iovec iov;
edafccee 9637
87094465
PB
9638 if (ctx->user_bufs)
9639 return -EBUSY;
489809e2 9640 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
87094465 9641 return -EINVAL;
bd54b6fe 9642 ret = io_rsrc_node_switch_start(ctx);
2b358604
BM
9643 if (ret)
9644 return ret;
d878c816
PB
9645 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9646 if (ret)
9647 return ret;
bd54b6fe
BM
9648 ret = io_buffers_map_alloc(ctx, nr_args);
9649 if (ret) {
bb6659cc 9650 io_rsrc_data_free(data);
bd54b6fe
BM
9651 return ret;
9652 }
edafccee 9653
87094465 9654 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
edafccee
JA
9655 ret = io_copy_iov(ctx, &iov, arg, i);
9656 if (ret)
0a96bbe4 9657 break;
2b358604
BM
9658 ret = io_buffer_validate(&iov);
9659 if (ret)
0a96bbe4 9660 break;
2d091d62 9661 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
cf3770e7
CIK
9662 ret = -EINVAL;
9663 break;
9664 }
edafccee 9665
41edf1a5
PB
9666 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9667 &last_hpage);
0a96bbe4
BM
9668 if (ret)
9669 break;
edafccee 9670 }
0a96bbe4 9671
bd54b6fe 9672 WARN_ON_ONCE(ctx->buf_data);
0a96bbe4 9673
bd54b6fe
BM
9674 ctx->buf_data = data;
9675 if (ret)
9676 __io_sqe_buffers_unregister(ctx);
9677 else
9678 io_rsrc_node_switch(ctx, NULL);
edafccee
JA
9679 return ret;
9680}
9681
634d00df
PB
9682static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9683 struct io_uring_rsrc_update2 *up,
9684 unsigned int nr_args)
9685{
9686 u64 __user *tags = u64_to_user_ptr(up->tags);
9687 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
634d00df
PB
9688 struct page *last_hpage = NULL;
9689 bool needs_switch = false;
9690 __u32 done;
9691 int i, err;
9692
9693 if (!ctx->buf_data)
9694 return -ENXIO;
9695 if (up->offset + nr_args > ctx->nr_user_bufs)
9696 return -EINVAL;
9697
9698 for (done = 0; done < nr_args; done++) {
0b8c0e7c
PB
9699 struct io_mapped_ubuf *imu;
9700 int offset = up->offset + done;
634d00df
PB
9701 u64 tag = 0;
9702
9703 err = io_copy_iov(ctx, &iov, iovs, done);
9704 if (err)
9705 break;
9706 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9707 err = -EFAULT;
9708 break;
9709 }
0b8c0e7c
PB
9710 err = io_buffer_validate(&iov);
9711 if (err)
9712 break;
cf3770e7
CIK
9713 if (!iov.iov_base && tag) {
9714 err = -EINVAL;
9715 break;
9716 }
0b8c0e7c
PB
9717 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9718 if (err)
9719 break;
634d00df 9720
0b8c0e7c 9721 i = array_index_nospec(offset, ctx->nr_user_bufs);
6224843d 9722 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
4cdd158b 9723 err = io_queue_rsrc_removal(ctx->buf_data, i,
0b8c0e7c
PB
9724 ctx->rsrc_node, ctx->user_bufs[i]);
9725 if (unlikely(err)) {
9726 io_buffer_unmap(ctx, &imu);
634d00df 9727 break;
0b8c0e7c 9728 }
634d00df
PB
9729 ctx->user_bufs[i] = NULL;
9730 needs_switch = true;
9731 }
9732
0b8c0e7c 9733 ctx->user_bufs[i] = imu;
2d091d62 9734 *io_get_tag_slot(ctx->buf_data, offset) = tag;
634d00df
PB
9735 }
9736
9737 if (needs_switch)
9738 io_rsrc_node_switch(ctx, ctx->buf_data);
9739 return done ? done : err;
9740}
9741
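/*
 * Register an eventfd to be signalled when completions are posted; only one
 * eventfd may be registered per ring at a time.
 */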
c75312dd
UA
9742static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
9743 unsigned int eventfd_async)
9b402849 9744{
77bc59b4 9745 struct io_ev_fd *ev_fd;
9b402849 9746 __s32 __user *fds = arg;
f0a4e62b 9747 int fd;
9b402849 9748
77bc59b4
UA
9749 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
9750 lockdep_is_held(&ctx->uring_lock));
9751 if (ev_fd)
9b402849
JA
9752 return -EBUSY;
9753
9754 if (copy_from_user(&fd, fds, sizeof(*fds)))
9755 return -EFAULT;
9756
77bc59b4
UA
9757 ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
9758 if (!ev_fd)
9759 return -ENOMEM;
fe7e3257 9760
77bc59b4
UA
9761 ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
9762 if (IS_ERR(ev_fd->cq_ev_fd)) {
f0a4e62b 9763 int ret = PTR_ERR(ev_fd->cq_ev_fd);
77bc59b4 9764 kfree(ev_fd);
9b402849
JA
9765 return ret;
9766 }
c75312dd 9767 ev_fd->eventfd_async = eventfd_async;
9aa8dfde 9768 ctx->has_evfd = true;
77bc59b4 9769 rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
f0a4e62b 9770 return 0;
77bc59b4
UA
9771}
9772
9773static void io_eventfd_put(struct rcu_head *rcu)
9774{
9775 struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
9776
9777 eventfd_ctx_put(ev_fd->cq_ev_fd);
9778 kfree(ev_fd);
9b402849
JA
9779}
9780
9781static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9782{
77bc59b4
UA
9783 struct io_ev_fd *ev_fd;
9784
9785 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
9786 lockdep_is_held(&ctx->uring_lock));
9787 if (ev_fd) {
9aa8dfde 9788 ctx->has_evfd = false;
77bc59b4
UA
9789 rcu_assign_pointer(ctx->io_ev_fd, NULL);
9790 call_rcu(&ev_fd->rcu, io_eventfd_put);
9b402849
JA
9791 return 0;
9792 }
9793
9794 return -ENXIO;
9795}
9796
5a2e745d
JA
9797static void io_destroy_buffers(struct io_ring_ctx *ctx)
9798{
dbc7d452
JA
9799 int i;
9800
9801 for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
9802 struct list_head *list = &ctx->io_buffers[i];
9e15c3a0 9803
dbc7d452
JA
9804 while (!list_empty(list)) {
9805 struct io_buffer_list *bl;
9806
9807 bl = list_first_entry(list, struct io_buffer_list, list);
9808 __io_remove_buffers(ctx, bl, -1U);
9809 list_del(&bl->list);
9810 kfree(bl);
9811 }
9812 }
cc3cec83
JA
9813
9814 while (!list_empty(&ctx->io_buffers_pages)) {
9815 struct page *page;
9816
9817 page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
9818 list_del_init(&page->lru);
9819 __free_page(page);
9820 }
5a2e745d
JA
9821}
9822
4010fec4 9823static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 9824{
cd0ca2e0 9825 struct io_submit_state *state = &ctx->submit_state;
37f0e767 9826 int nr = 0;
bf019da7 9827
9a4fdbd8 9828 mutex_lock(&ctx->uring_lock);
cd0ca2e0 9829 io_flush_cached_locked_reqs(ctx, state);
9a4fdbd8 9830
88ab95be 9831 while (!io_req_cache_empty(ctx)) {
c2b6c6bc
PB
9832 struct io_wq_work_node *node;
9833 struct io_kiocb *req;
9a4fdbd8 9834
c2b6c6bc
PB
9835 node = wq_stack_extract(&state->free_list);
9836 req = container_of(node, struct io_kiocb, comp_list);
9837 kmem_cache_free(req_cachep, req);
37f0e767 9838 nr++;
c2b6c6bc 9839 }
37f0e767
PB
9840 if (nr)
9841 percpu_ref_put_many(&ctx->refs, nr);
9a4fdbd8
JA
9842 mutex_unlock(&ctx->uring_lock);
9843}
9844
43597aac 9845static void io_wait_rsrc_data(struct io_rsrc_data *data)
2b188cc1 9846{
43597aac 9847 if (data && !atomic_dec_and_test(&data->refs))
bd54b6fe 9848 wait_for_completion(&data->done);
bd54b6fe 9849}
04fc6c80 9850
4d9237e3
JA
9851static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
9852{
9853 struct async_poll *apoll;
9854
9855 while (!list_empty(&ctx->apoll_cache)) {
9856 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
9857 poll.wait.entry);
9858 list_del(&apoll->poll.wait.entry);
9859 kfree(apoll);
9860 }
9861}
9862
c072481d 9863static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2b188cc1 9864{
37d1e2e3 9865 io_sq_thread_finish(ctx);
2aede0e4 9866
37d1e2e3 9867 if (ctx->mm_account) {
2aede0e4
JA
9868 mmdrop(ctx->mm_account);
9869 ctx->mm_account = NULL;
30975825 9870 }
def596e9 9871
ab409402 9872 io_rsrc_refs_drop(ctx);
43597aac
PB
9873 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9874 io_wait_rsrc_data(ctx->buf_data);
9875 io_wait_rsrc_data(ctx->file_data);
9876
8bad28d8 9877 mutex_lock(&ctx->uring_lock);
43597aac 9878 if (ctx->buf_data)
bd54b6fe 9879 __io_sqe_buffers_unregister(ctx);
43597aac 9880 if (ctx->file_data)
08480400 9881 __io_sqe_files_unregister(ctx);
c4ea060e
PB
9882 if (ctx->rings)
9883 __io_cqring_overflow_flush(ctx, true);
9b402849 9884 io_eventfd_unregister(ctx);
4d9237e3 9885 io_flush_apoll_cache(ctx);
77bc59b4 9886 mutex_unlock(&ctx->uring_lock);
5a2e745d 9887 io_destroy_buffers(ctx);
07db298a
PB
9888 if (ctx->sq_creds)
9889 put_cred(ctx->sq_creds);
def596e9 9890
a7f0ed5a
PB
9891 /* there are no registered resources left, nobody uses it */
9892 if (ctx->rsrc_node)
9893 io_rsrc_node_destroy(ctx->rsrc_node);
8dd03afe 9894 if (ctx->rsrc_backup_node)
b895c9a6 9895 io_rsrc_node_destroy(ctx->rsrc_backup_node);
a7f0ed5a 9896 flush_delayed_work(&ctx->rsrc_put_work);
756ab7c0 9897 flush_delayed_work(&ctx->fallback_work);
a7f0ed5a
PB
9898
9899 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9900 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
def596e9 9901
2b188cc1 9902#if defined(CONFIG_UNIX)
355e8d26
EB
9903 if (ctx->ring_sock) {
9904 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 9905 sock_release(ctx->ring_sock);
355e8d26 9906 }
2b188cc1 9907#endif
ef9dd637 9908 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2b188cc1 9909
75b28aff 9910 io_mem_free(ctx->rings);
2b188cc1 9911 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
9912
9913 percpu_ref_exit(&ctx->refs);
2b188cc1 9914 free_uid(ctx->user);
4010fec4 9915 io_req_caches_free(ctx);
e941894e
JA
9916 if (ctx->hash_map)
9917 io_wq_put_hash(ctx->hash_map);
78076bb6 9918 kfree(ctx->cancel_hash);
6224843d 9919 kfree(ctx->dummy_ubuf);
dbc7d452 9920 kfree(ctx->io_buffers);
2b188cc1
JA
9921 kfree(ctx);
9922}
9923
9924static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9925{
9926 struct io_ring_ctx *ctx = file->private_data;
9927 __poll_t mask = 0;
9928
d60aa65b 9929 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
9930 /*
9931 * synchronizes with barrier from wq_has_sleeper call in
9932 * io_commit_cqring
9933 */
2b188cc1 9934 smp_rmb();
90554200 9935 if (!io_sqring_full(ctx))
2b188cc1 9936 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
9937
9938 /*
9939	 * Don't flush the cqring overflow list here; just do a simple check.
9940	 * Otherwise there could possibly be an ABBA deadlock:
9941 * CPU0 CPU1
9942 * ---- ----
9943 * lock(&ctx->uring_lock);
9944 * lock(&ep->mtx);
9945 * lock(&ctx->uring_lock);
9946 * lock(&ep->mtx);
9947 *
9948	 * Users may get EPOLLIN while seeing nothing in the cqring, which
9949	 * pushes them to do the flush.
9950 */
5ed7a37d 9951 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
2b188cc1
JA
9952 mask |= EPOLLIN | EPOLLRDNORM;
9953
9954 return mask;
9955}
9956
0bead8cd 9957static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 9958{
4379bf8b 9959 const struct cred *creds;
071698e1 9960
61cf9370 9961 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
9962 if (creds) {
9963 put_cred(creds);
0bead8cd 9964 return 0;
1e6fa521 9965 }
0bead8cd
YD
9966
9967 return -EINVAL;
9968}
9969
d56d938b
PB
9970struct io_tctx_exit {
9971 struct callback_head task_work;
9972 struct completion completion;
baf186c4 9973 struct io_ring_ctx *ctx;
d56d938b
PB
9974};
9975
c072481d 9976static __cold void io_tctx_exit_cb(struct callback_head *cb)
d56d938b
PB
9977{
9978 struct io_uring_task *tctx = current->io_uring;
9979 struct io_tctx_exit *work;
9980
9981 work = container_of(cb, struct io_tctx_exit, task_work);
9982 /*
9983 * When @in_idle, we're in cancellation and it's racy to remove the
9984	 * node. It'll be removed by the end of cancellation; just ignore it.
9985 */
9986 if (!atomic_read(&tctx->in_idle))
eef51daa 9987 io_uring_del_tctx_node((unsigned long)work->ctx);
d56d938b
PB
9988 complete(&work->completion);
9989}
9990
c072481d 9991static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
28090c13
PB
9992{
9993 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9994
9995 return req->ctx == data;
9996}
9997
c072481d 9998static __cold void io_ring_exit_work(struct work_struct *work)
85faa7b8 9999{
d56d938b 10000 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 10001 unsigned long timeout = jiffies + HZ * 60 * 5;
58d3be2c 10002 unsigned long interval = HZ / 20;
d56d938b
PB
10003 struct io_tctx_exit exit;
10004 struct io_tctx_node *node;
10005 int ret;
85faa7b8 10006
56952e91
JA
10007 /*
10008 * If we're doing polled IO and end up having requests being
10009 * submitted async (out-of-line), then completions can come in while
10010 * we're waiting for refs to drop. We need to reap these manually,
10011 * as nobody else will be looking for them.
10012 */
b2edc0a7 10013 do {
3dd0c97a 10014 io_uring_try_cancel_requests(ctx, NULL, true);
28090c13
PB
10015 if (ctx->sq_data) {
10016 struct io_sq_data *sqd = ctx->sq_data;
10017 struct task_struct *tsk;
10018
10019 io_sq_thread_park(sqd);
10020 tsk = sqd->thread;
10021 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
10022 io_wq_cancel_cb(tsk->io_uring->io_wq,
10023 io_cancel_ctx_cb, ctx, true);
10024 io_sq_thread_unpark(sqd);
10025 }
b5bb3a24 10026
37f0e767
PB
10027 io_req_caches_free(ctx);
10028
58d3be2c
PB
10029 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
10030 /* there is little hope left, don't run it too often */
10031 interval = HZ * 60;
10032 }
10033 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
d56d938b 10034
7f00651a
PB
10035 init_completion(&exit.completion);
10036 init_task_work(&exit.task_work, io_tctx_exit_cb);
10037 exit.ctx = ctx;
89b5066e
PB
10038 /*
10039 * Some may use context even when all refs and requests have been put,
10040 * and they are free to do so while still holding uring_lock or
5b0a6acc 10041 * completion_lock, see io_req_task_submit(). Apart from other work,
89b5066e
PB
10042 * this lock/unlock section also waits them to finish.
10043 */
d56d938b
PB
10044 mutex_lock(&ctx->uring_lock);
10045 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
10046 WARN_ON_ONCE(time_after(jiffies, timeout));
10047
d56d938b
PB
10048 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
10049 ctx_node);
7f00651a
PB
10050 /* don't spin on a single task if cancellation failed */
10051 list_rotate_left(&ctx->tctx_list);
d56d938b
PB
10052 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
10053 if (WARN_ON_ONCE(ret))
10054 continue;
d56d938b
PB
10055
10056 mutex_unlock(&ctx->uring_lock);
10057 wait_for_completion(&exit.completion);
d56d938b
PB
10058 mutex_lock(&ctx->uring_lock);
10059 }
10060 mutex_unlock(&ctx->uring_lock);
79ebeaee
JA
10061 spin_lock(&ctx->completion_lock);
10062 spin_unlock(&ctx->completion_lock);
d56d938b 10063
85faa7b8
JA
10064 io_ring_ctx_free(ctx);
10065}
10066
80c4cbdb 10067/* Returns true if we found and killed one or more timeouts */
c072481d
PB
10068static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
10069 struct task_struct *tsk, bool cancel_all)
80c4cbdb
PB
10070{
10071 struct io_kiocb *req, *tmp;
10072 int canceled = 0;
10073
79ebeaee
JA
10074 spin_lock(&ctx->completion_lock);
10075 spin_lock_irq(&ctx->timeout_lock);
80c4cbdb 10076 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
3dd0c97a 10077 if (io_match_task(req, tsk, cancel_all)) {
80c4cbdb
PB
10078 io_kill_timeout(req, -ECANCELED);
10079 canceled++;
10080 }
10081 }
79ebeaee 10082 spin_unlock_irq(&ctx->timeout_lock);
60053be8 10083 io_commit_cqring(ctx);
79ebeaee 10084 spin_unlock(&ctx->completion_lock);
80c4cbdb
PB
10085 if (canceled != 0)
10086 io_cqring_ev_posted(ctx);
10087 return canceled != 0;
10088}
10089
c072481d 10090static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2b188cc1 10091{
61cf9370
MWO
10092 unsigned long index;
10093 struct creds *creds;
10094
2b188cc1
JA
10095 mutex_lock(&ctx->uring_lock);
10096 percpu_ref_kill(&ctx->refs);
634578f8 10097 if (ctx->rings)
6c2450ae 10098 __io_cqring_overflow_flush(ctx, true);
61cf9370
MWO
10099 xa_for_each(&ctx->personalities, index, creds)
10100 io_unregister_personality(ctx, index);
2b188cc1
JA
10101 mutex_unlock(&ctx->uring_lock);
10102
60053be8
PB
10103 /* failed during ring init, it couldn't have issued any requests */
10104 if (ctx->rings) {
10105 io_kill_timeouts(ctx, NULL, true);
10106 io_poll_remove_all(ctx, NULL, true);
10107 /* if we failed setting up the ctx, we might not have any rings */
10108 io_iopoll_try_reap_events(ctx);
10109 }
309fc03a 10110
85faa7b8 10111 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
10112 /*
10113 * Use system_unbound_wq to avoid spawning tons of event kworkers
10114 * if we're exiting a ton of rings at the same time. It just adds
10115	 * noise and overhead; there's no discernible change in runtime
10116 * over using system_wq.
10117 */
10118 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
10119}
10120
10121static int io_uring_release(struct inode *inode, struct file *file)
10122{
10123 struct io_ring_ctx *ctx = file->private_data;
10124
10125 file->private_data = NULL;
10126 io_ring_ctx_wait_and_kill(ctx);
10127 return 0;
10128}
10129
f6edbabb
PB
10130struct io_task_cancel {
10131 struct task_struct *task;
3dd0c97a 10132 bool all;
f6edbabb 10133};
f254ac04 10134
f6edbabb 10135static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 10136{
9a472ef7 10137 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 10138 struct io_task_cancel *cancel = data;
9a472ef7 10139
6af3f48b 10140 return io_match_task_safe(req, cancel->task, cancel->all);
b711d4ea
JA
10141}
10142
c072481d
PB
10143static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
10144 struct task_struct *task,
10145 bool cancel_all)
b7ddce3c 10146{
e1915f76 10147 struct io_defer_entry *de;
b7ddce3c
PB
10148 LIST_HEAD(list);
10149
79ebeaee 10150 spin_lock(&ctx->completion_lock);
b7ddce3c 10151 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
6af3f48b 10152 if (io_match_task_safe(de->req, task, cancel_all)) {
b7ddce3c
PB
10153 list_cut_position(&list, &ctx->defer_list, &de->list);
10154 break;
10155 }
10156 }
79ebeaee 10157 spin_unlock(&ctx->completion_lock);
e1915f76
PB
10158 if (list_empty(&list))
10159 return false;
b7ddce3c
PB
10160
10161 while (!list_empty(&list)) {
10162 de = list_first_entry(&list, struct io_defer_entry, list);
10163 list_del_init(&de->list);
f41db273 10164 io_req_complete_failed(de->req, -ECANCELED);
b7ddce3c
PB
10165 kfree(de);
10166 }
e1915f76 10167 return true;
b7ddce3c
PB
10168}
10169
c072481d 10170static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
1b00764f
PB
10171{
10172 struct io_tctx_node *node;
10173 enum io_wq_cancel cret;
10174 bool ret = false;
10175
10176 mutex_lock(&ctx->uring_lock);
10177 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10178 struct io_uring_task *tctx = node->task->io_uring;
10179
10180 /*
10181 * io_wq will stay alive while we hold uring_lock, because it's
10182 * killed after ctx nodes, which requires to take the lock.
10183 */
10184 if (!tctx || !tctx->io_wq)
10185 continue;
10186 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
10187 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10188 }
10189 mutex_unlock(&ctx->uring_lock);
10190
10191 return ret;
10192}
10193
c072481d
PB
10194static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
10195 struct task_struct *task,
10196 bool cancel_all)
9936c7c2 10197{
3dd0c97a 10198 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
1b00764f 10199 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2 10200
60053be8
PB
10201 /* failed during ring init, it couldn't have issued any requests */
10202 if (!ctx->rings)
10203 return;
10204
9936c7c2
PB
10205 while (1) {
10206 enum io_wq_cancel cret;
10207 bool ret = false;
10208
1b00764f
PB
10209 if (!task) {
10210 ret |= io_uring_try_cancel_iowq(ctx);
10211 } else if (tctx && tctx->io_wq) {
10212 /*
10213 * Cancels requests of all rings, not only @ctx, but
10214 * it's fine as the task is in exit/exec.
10215 */
5aa75ed5 10216 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
10217 &cancel, true);
10218 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10219 }
10220
10221 /* SQPOLL thread does its own polling */
3dd0c97a 10222 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
d052d1d6 10223 (ctx->sq_data && ctx->sq_data->thread == current)) {
5eef4e87 10224 while (!wq_list_empty(&ctx->iopoll_list)) {
9936c7c2
PB
10225 io_iopoll_try_reap_events(ctx);
10226 ret = true;
10227 }
10228 }
10229
3dd0c97a
PB
10230 ret |= io_cancel_defer_files(ctx, task, cancel_all);
10231 ret |= io_poll_remove_all(ctx, task, cancel_all);
10232 ret |= io_kill_timeouts(ctx, task, cancel_all);
e5dc480d
PB
10233 if (task)
10234 ret |= io_run_task_work();
9936c7c2
PB
10235 if (!ret)
10236 break;
10237 cond_resched();
10238 }
10239}
10240
eef51daa 10241static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
0f212204 10242{
236434c3 10243 struct io_uring_task *tctx = current->io_uring;
13bf43f5 10244 struct io_tctx_node *node;
a528b04e 10245 int ret;
236434c3
MWO
10246
10247 if (unlikely(!tctx)) {
5aa75ed5 10248 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
10249 if (unlikely(ret))
10250 return ret;
e139a1ec 10251
236434c3 10252 tctx = current->io_uring;
e139a1ec
PB
10253 if (ctx->iowq_limits_set) {
10254 unsigned int limits[2] = { ctx->iowq_limits[0],
10255 ctx->iowq_limits[1], };
10256
10257 ret = io_wq_max_workers(tctx->io_wq, limits);
10258 if (ret)
10259 return ret;
10260 }
0f212204 10261 }
cf27f3b1
PB
10262 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
10263 node = kmalloc(sizeof(*node), GFP_KERNEL);
10264 if (!node)
10265 return -ENOMEM;
10266 node->ctx = ctx;
10267 node->task = current;
13bf43f5 10268
cf27f3b1
PB
10269 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
10270 node, GFP_KERNEL));
10271 if (ret) {
10272 kfree(node);
10273 return ret;
0f212204 10274 }
cf27f3b1
PB
10275
10276 mutex_lock(&ctx->uring_lock);
10277 list_add(&node->ctx_node, &ctx->tctx_list);
10278 mutex_unlock(&ctx->uring_lock);
0f212204 10279 }
cf27f3b1 10280 tctx->last = ctx;
0f212204
JA
10281 return 0;
10282}
10283
cf27f3b1
PB
10284/*
10285 * Note that this task has used io_uring. We use it for cancelation purposes.
10286 */
eef51daa 10287static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
cf27f3b1
PB
10288{
10289 struct io_uring_task *tctx = current->io_uring;
10290
10291 if (likely(tctx && tctx->last == ctx))
10292 return 0;
eef51daa 10293 return __io_uring_add_tctx_node(ctx);
cf27f3b1
PB
10294}
10295
0f212204
JA
10296/*
10297 * Remove this io_uring_file -> task mapping.
10298 */
c072481d 10299static __cold void io_uring_del_tctx_node(unsigned long index)
0f212204
JA
10300{
10301 struct io_uring_task *tctx = current->io_uring;
13bf43f5 10302 struct io_tctx_node *node;
2941267b 10303
eebd2e37
PB
10304 if (!tctx)
10305 return;
13bf43f5
PB
10306 node = xa_erase(&tctx->xa, index);
10307 if (!node)
2941267b 10308 return;
0f212204 10309
13bf43f5
PB
10310 WARN_ON_ONCE(current != node->task);
10311 WARN_ON_ONCE(list_empty(&node->ctx_node));
10312
10313 mutex_lock(&node->ctx->uring_lock);
10314 list_del(&node->ctx_node);
10315 mutex_unlock(&node->ctx->uring_lock);
10316
baf186c4 10317 if (tctx->last == node->ctx)
0f212204 10318 tctx->last = NULL;
13bf43f5 10319 kfree(node);
0f212204
JA
10320}
10321
c072481d 10322static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 10323{
ba5ef6dc 10324 struct io_wq *wq = tctx->io_wq;
13bf43f5 10325 struct io_tctx_node *node;
de7f1d9e
PB
10326 unsigned long index;
10327
8bab4c09 10328 xa_for_each(&tctx->xa, index, node) {
eef51daa 10329 io_uring_del_tctx_node(index);
8bab4c09
JA
10330 cond_resched();
10331 }
b16ef427
ME
10332 if (wq) {
10333 /*
f6f9b278 10334 * Must be after io_uring_del_tctx_node() (removes nodes under
b16ef427
ME
10335 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
10336 */
ba5ef6dc 10337 io_wq_put_and_exit(wq);
dadebc35 10338 tctx->io_wq = NULL;
b16ef427 10339 }
de7f1d9e
PB
10340}
10341
3f48cf18 10342static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
521d6a73 10343{
3f48cf18 10344 if (tracked)
d5361233 10345 return 0;
521d6a73
PB
10346 return percpu_counter_sum(&tctx->inflight);
10347}
10348
78cc687b
PB
10349/*
10350 * Find any io_uring ctx that this task has registered or done IO on, and cancel
78a78060 10351 * requests. @sqd should be non-NULL iff it's an SQPOLL thread cancellation.
78cc687b 10352 */
c072481d
PB
10353static __cold void io_uring_cancel_generic(bool cancel_all,
10354 struct io_sq_data *sqd)
0e9ddb39 10355{
521d6a73 10356 struct io_uring_task *tctx = current->io_uring;
734551df 10357 struct io_ring_ctx *ctx;
0e9ddb39
PB
10358 s64 inflight;
10359 DEFINE_WAIT(wait);
fdaf083c 10360
78cc687b
PB
10361 WARN_ON_ONCE(sqd && sqd->thread != current);
10362
6d042ffb
PO
10363 if (!current->io_uring)
10364 return;
17a91051
PB
10365 if (tctx->io_wq)
10366 io_wq_exit_start(tctx->io_wq);
10367
0e9ddb39
PB
10368 atomic_inc(&tctx->in_idle);
10369 do {
e9dbe221 10370 io_uring_drop_tctx_refs(current);
0e9ddb39 10371 /* read completions before cancelations */
78cc687b 10372 inflight = tctx_inflight(tctx, !cancel_all);
0e9ddb39
PB
10373 if (!inflight)
10374 break;
fdaf083c 10375
78cc687b
PB
10376 if (!sqd) {
10377 struct io_tctx_node *node;
10378 unsigned long index;
0f212204 10379
78cc687b
PB
10380 xa_for_each(&tctx->xa, index, node) {
10381 /* sqpoll task will cancel all its requests */
10382 if (node->ctx->sq_data)
10383 continue;
10384 io_uring_try_cancel_requests(node->ctx, current,
10385 cancel_all);
10386 }
10387 } else {
10388 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
10389 io_uring_try_cancel_requests(ctx, current,
10390 cancel_all);
10391 }
17a91051 10392
78a78060
JA
10393 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
10394 io_run_task_work();
e9dbe221 10395 io_uring_drop_tctx_refs(current);
78a78060 10396
0f212204 10397 /*
a1bb3cd5
PB
10398 * If we've seen completions, retry without waiting. This
10399 * avoids a race where a completion comes in before we did
10400 * prepare_to_wait().
0f212204 10401 */
3dd0c97a 10402 if (inflight == tctx_inflight(tctx, !cancel_all))
a1bb3cd5 10403 schedule();
f57555ed 10404 finish_wait(&tctx->wait, &wait);
d8a6df10 10405 } while (1);
de7f1d9e 10406
8452d4a6 10407 io_uring_clean_tctx(tctx);
3dd0c97a 10408 if (cancel_all) {
3cc7fdb9
PB
10409 /*
10410 * We shouldn't run task_works after cancel, so just leave
10411 * ->in_idle set for normal exit.
10412 */
10413 atomic_dec(&tctx->in_idle);
3f48cf18
PB
10414 /* for exec all current's requests should be gone, kill tctx */
10415 __io_uring_free(current);
10416 }
44e728b8
PB
10417}
10418
f552a27a 10419void __io_uring_cancel(bool cancel_all)
78cc687b 10420{
f552a27a 10421 io_uring_cancel_generic(cancel_all, NULL);
78cc687b
PB
10422}
10423
e7a6c00d
JA
10424void io_uring_unreg_ringfd(void)
10425{
10426 struct io_uring_task *tctx = current->io_uring;
10427 int i;
10428
10429 for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
10430 if (tctx->registered_rings[i]) {
10431 fput(tctx->registered_rings[i]);
10432 tctx->registered_rings[i] = NULL;
10433 }
10434 }
10435}
10436
10437static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
10438 int start, int end)
10439{
10440 struct file *file;
10441 int offset;
10442
10443 for (offset = start; offset < end; offset++) {
10444 offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
10445 if (tctx->registered_rings[offset])
10446 continue;
10447
10448 file = fget(fd);
10449 if (!file) {
10450 return -EBADF;
10451 } else if (file->f_op != &io_uring_fops) {
10452 fput(file);
10453 return -EOPNOTSUPP;
10454 }
10455 tctx->registered_rings[offset] = file;
10456 return offset;
10457 }
10458
10459 return -EBUSY;
10460}
10461
10462/*
10463 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
10464 * invocation. User passes in an array of struct io_uring_rsrc_update
10465 * with ->data set to the ring_fd, and ->offset given for the desired
10466 * index. If no index is desired, application may set ->offset == -1U
10467 * and we'll find an available index. Returns number of entries
10468 * successfully processed, or < 0 on error if none were processed.
10469 */
10470static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
10471 unsigned nr_args)
10472{
10473 struct io_uring_rsrc_update __user *arg = __arg;
10474 struct io_uring_rsrc_update reg;
10475 struct io_uring_task *tctx;
10476 int ret, i;
10477
10478 if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10479 return -EINVAL;
10480
10481 mutex_unlock(&ctx->uring_lock);
10482 ret = io_uring_add_tctx_node(ctx);
10483 mutex_lock(&ctx->uring_lock);
10484 if (ret)
10485 return ret;
10486
10487 tctx = current->io_uring;
10488 for (i = 0; i < nr_args; i++) {
10489 int start, end;
10490
10491 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10492 ret = -EFAULT;
10493 break;
10494 }
10495
6fb53cf8
DY
10496 if (reg.resv) {
10497 ret = -EINVAL;
10498 break;
10499 }
10500
e7a6c00d
JA
10501 if (reg.offset == -1U) {
10502 start = 0;
10503 end = IO_RINGFD_REG_MAX;
10504 } else {
10505 if (reg.offset >= IO_RINGFD_REG_MAX) {
10506 ret = -EINVAL;
10507 break;
10508 }
10509 start = reg.offset;
10510 end = start + 1;
10511 }
10512
10513 ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
10514 if (ret < 0)
10515 break;
10516
10517 reg.offset = ret;
10518 if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
10519 fput(tctx->registered_rings[reg.offset]);
10520 tctx->registered_rings[reg.offset] = NULL;
10521 ret = -EFAULT;
10522 break;
10523 }
10524 }
10525
10526 return i ? i : ret;
10527}
10528
10529static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
10530 unsigned nr_args)
10531{
10532 struct io_uring_rsrc_update __user *arg = __arg;
10533 struct io_uring_task *tctx = current->io_uring;
10534 struct io_uring_rsrc_update reg;
10535 int ret = 0, i;
10536
10537 if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10538 return -EINVAL;
10539 if (!tctx)
10540 return 0;
10541
10542 for (i = 0; i < nr_args; i++) {
10543 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10544 ret = -EFAULT;
10545 break;
10546 }
6fb53cf8 10547 if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) {
e7a6c00d
JA
10548 ret = -EINVAL;
10549 break;
10550 }
10551
10552 reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
10553 if (tctx->registered_rings[reg.offset]) {
10554 fput(tctx->registered_rings[reg.offset]);
10555 tctx->registered_rings[reg.offset] = NULL;
10556 }
10557 }
10558
10559 return i ? i : ret;
10560}
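
/*
 * Userspace sketch of the registered-ring-fd flow served by the two helpers
 * above (a sketch only; helper name is illustrative, error handling is
 * trimmed, and it assumes the uAPI in <linux/io_uring.h> plus the
 * __NR_io_uring_* numbers from <sys/syscall.h>). The ring fd is registered
 * once; later io_uring_enter() calls pass the registered index together with
 * IORING_ENTER_REGISTERED_RING instead of the real fd.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int enter_via_registered_fd(int ring_fd, unsigned to_submit)
{
	struct io_uring_rsrc_update reg;
	int ret;

	/* ->data = ring fd, ->offset = -1U lets the kernel pick a free slot */
	memset(&reg, 0, sizeof(reg));
	reg.data = ring_fd;
	reg.offset = -1U;
	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RING_FDS, &reg, 1);
	if (ret != 1)
		return -1;
	/* on success the kernel wrote the chosen index back into reg.offset */

	/* enter through the registered index; no fdget/fdput in the kernel */
	ret = syscall(__NR_io_uring_enter, (int)reg.offset, to_submit, 0,
		      IORING_ENTER_REGISTERED_RING, NULL, 0);

	/* drop the registration again when done with it */
	syscall(__NR_io_uring_register, ring_fd,
		IORING_UNREGISTER_RING_FDS, &reg, 1);
	return ret;
}
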
10561
6c5c240e
RP
10562static void *io_uring_validate_mmap_request(struct file *file,
10563 loff_t pgoff, size_t sz)
2b188cc1 10564{
2b188cc1 10565 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 10566 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
10567 struct page *page;
10568 void *ptr;
10569
10570 switch (offset) {
10571 case IORING_OFF_SQ_RING:
75b28aff
HV
10572 case IORING_OFF_CQ_RING:
10573 ptr = ctx->rings;
2b188cc1
JA
10574 break;
10575 case IORING_OFF_SQES:
10576 ptr = ctx->sq_sqes;
10577 break;
2b188cc1 10578 default:
6c5c240e 10579 return ERR_PTR(-EINVAL);
2b188cc1
JA
10580 }
10581
10582 page = virt_to_head_page(ptr);
a50b854e 10583 if (sz > page_size(page))
6c5c240e
RP
10584 return ERR_PTR(-EINVAL);
10585
10586 return ptr;
10587}
10588
10589#ifdef CONFIG_MMU
10590
c072481d 10591static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6c5c240e
RP
10592{
10593 size_t sz = vma->vm_end - vma->vm_start;
10594 unsigned long pfn;
10595 void *ptr;
10596
10597 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
10598 if (IS_ERR(ptr))
10599 return PTR_ERR(ptr);
2b188cc1
JA
10600
10601 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
10602 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
10603}
10604
6c5c240e
RP
10605#else /* !CONFIG_MMU */
10606
10607static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10608{
10609 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
10610}
10611
10612static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
10613{
10614 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
10615}
10616
10617static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
10618 unsigned long addr, unsigned long len,
10619 unsigned long pgoff, unsigned long flags)
10620{
10621 void *ptr;
10622
10623 ptr = io_uring_validate_mmap_request(file, pgoff, len);
10624 if (IS_ERR(ptr))
10625 return PTR_ERR(ptr);
10626
10627 return (unsigned long) ptr;
10628}
10629
10630#endif /* !CONFIG_MMU */
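
/*
 * Userspace sketch of the three mmap offsets accepted by
 * io_uring_validate_mmap_request() above (a sketch only; the helper name and
 * struct are illustrative, 'p' is assumed to be the io_uring_params that
 * io_uring_setup() filled in for 'ring_fd', and error handling is left to
 * the caller). With IORING_FEAT_SINGLE_MMAP the SQ and CQ rings share one
 * mapping.
 */
#include <linux/io_uring.h>
#include <sys/mman.h>

struct ring_maps {
	void *sq_ring;
	void *cq_ring;
	struct io_uring_sqe *sqes;
};

static int map_rings(int ring_fd, const struct io_uring_params *p,
		     struct ring_maps *m)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(__u32);
	size_t cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);

	if (p->features & IORING_FEAT_SINGLE_MMAP)
		sq_sz = cq_sz = (sq_sz > cq_sz) ? sq_sz : cq_sz;

	m->sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
			  MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	m->cq_ring = (p->features & IORING_FEAT_SINGLE_MMAP) ? m->sq_ring :
		     mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			  MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
	m->sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		       ring_fd, IORING_OFF_SQES);

	return (m->sq_ring == MAP_FAILED || m->cq_ring == MAP_FAILED ||
		m->sqes == MAP_FAILED) ? -1 : 0;
}
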
10631
d9d05217 10632static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
10633{
10634 DEFINE_WAIT(wait);
10635
10636 do {
10637 if (!io_sqring_full(ctx))
10638 break;
90554200
JA
10639 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
10640
10641 if (!io_sqring_full(ctx))
10642 break;
90554200
JA
10643 schedule();
10644 } while (!signal_pending(current));
10645
10646 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 10647 return 0;
90554200
JA
10648}
10649
f81440d3
PB
10650static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
10651{
10652 if (flags & IORING_ENTER_EXT_ARG) {
10653 struct io_uring_getevents_arg arg;
10654
10655 if (argsz != sizeof(arg))
10656 return -EINVAL;
10657 if (copy_from_user(&arg, argp, sizeof(arg)))
10658 return -EFAULT;
10659 }
10660 return 0;
10661}
10662
c73ebb68
HX
10663static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
10664 struct __kernel_timespec __user **ts,
10665 const sigset_t __user **sig)
10666{
10667 struct io_uring_getevents_arg arg;
10668
10669 /*
10670 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
10671 * is just a pointer to the sigset_t.
10672 */
10673 if (!(flags & IORING_ENTER_EXT_ARG)) {
10674 *sig = (const sigset_t __user *) argp;
10675 *ts = NULL;
10676 return 0;
10677 }
10678
10679 /*
10680 * EXT_ARG is set - ensure we agree on the size of it and copy in our
10681 * timespec and sigset_t pointers if good.
10682 */
10683 if (*argsz != sizeof(arg))
10684 return -EINVAL;
10685 if (copy_from_user(&arg, argp, sizeof(arg)))
10686 return -EFAULT;
d2347b96
DY
10687 if (arg.pad)
10688 return -EINVAL;
c73ebb68
HX
10689 *sig = u64_to_user_ptr(arg.sigmask);
10690 *argsz = arg.sigmask_sz;
10691 *ts = u64_to_user_ptr(arg.ts);
10692 return 0;
10693}
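
/*
 * Userspace sketch of the IORING_ENTER_EXT_ARG convention decoded by
 * io_get_ext_arg() above (a sketch only; helper name is illustrative and it
 * assumes <linux/io_uring.h>, <linux/time_types.h> and __NR_io_uring_enter):
 * instead of a bare sigset_t, argp points at a struct io_uring_getevents_arg
 * carrying both a sigmask and a timeout.
 */
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int wait_cqes_timeout(int ring_fd, unsigned min_complete,
			     const sigset_t *mask, long timeout_ms)
{
	struct __kernel_timespec ts = {
		.tv_sec  = timeout_ms / 1000,
		.tv_nsec = (timeout_ms % 1000) * 1000000L,
	};
	struct io_uring_getevents_arg arg;

	memset(&arg, 0, sizeof(arg));		/* arg.pad must be zero */
	arg.sigmask = (__u64)(uintptr_t)mask;
	arg.sigmask_sz = mask ? _NSIG / 8 : 0;	/* kernel sigset size */
	arg.ts = (__u64)(uintptr_t)&ts;

	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}
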
10694
2b188cc1 10695SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
10696 u32, min_complete, u32, flags, const void __user *, argp,
10697 size_t, argsz)
2b188cc1
JA
10698{
10699 struct io_ring_ctx *ctx;
2b188cc1
JA
10700 int submitted = 0;
10701 struct fd f;
33f993da 10702 long ret;
2b188cc1 10703
4c6e277c 10704 io_run_task_work();
b41e9852 10705
33f993da 10706 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
e7a6c00d
JA
10707 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
10708 IORING_ENTER_REGISTERED_RING)))
2b188cc1
JA
10709 return -EINVAL;
10710
e7a6c00d
JA
10711 /*
10712 * The ring fd has been registered via IORING_REGISTER_RING_FDS, so we
10713 * need only dereference our task private array to find it.
10714 */
10715 if (flags & IORING_ENTER_REGISTERED_RING) {
10716 struct io_uring_task *tctx = current->io_uring;
10717
10718 if (!tctx || fd >= IO_RINGFD_REG_MAX)
10719 return -EINVAL;
10720 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
10721 f.file = tctx->registered_rings[fd];
10722 if (unlikely(!f.file))
10723 return -EBADF;
10724 } else {
10725 f = fdget(fd);
10726 if (unlikely(!f.file))
10727 return -EBADF;
10728 }
2b188cc1
JA
10729
10730 ret = -EOPNOTSUPP;
33f993da 10731 if (unlikely(f.file->f_op != &io_uring_fops))
2b188cc1
JA
10732 goto out_fput;
10733
10734 ret = -ENXIO;
10735 ctx = f.file->private_data;
33f993da 10736 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
2b188cc1
JA
10737 goto out_fput;
10738
7e84e1c7 10739 ret = -EBADFD;
33f993da 10740 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7e84e1c7
SG
10741 goto out;
10742
6c271ce2
JA
10743 /*
10744 * For SQ polling, the thread will do all submissions and completions.
10745 * Just return the requested submit count, and wake the thread if
10746 * we were asked to.
10747 */
b2a9eada 10748 ret = 0;
6c271ce2 10749 if (ctx->flags & IORING_SETUP_SQPOLL) {
90f67366 10750 io_cqring_overflow_flush(ctx);
89448c47 10751
21f96522
JA
10752 if (unlikely(ctx->sq_data->thread == NULL)) {
10753 ret = -EOWNERDEAD;
04147488 10754 goto out;
21f96522 10755 }
6c271ce2 10756 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 10757 wake_up(&ctx->sq_data->wait);
d9d05217
PB
10758 if (flags & IORING_ENTER_SQ_WAIT) {
10759 ret = io_sqpoll_wait_sq(ctx);
10760 if (ret)
10761 goto out;
10762 }
6c271ce2 10763 submitted = to_submit;
b2a9eada 10764 } else if (to_submit) {
eef51daa 10765 ret = io_uring_add_tctx_node(ctx);
0f212204
JA
10766 if (unlikely(ret))
10767 goto out;
d487b43c 10768
2b188cc1 10769 mutex_lock(&ctx->uring_lock);
0f212204 10770 submitted = io_submit_sqes(ctx, to_submit);
d487b43c
PB
10771 if (submitted != to_submit) {
10772 mutex_unlock(&ctx->uring_lock);
7c504e65 10773 goto out;
d487b43c
PB
10774 }
10775 if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
10776 goto iopoll_locked;
10777 mutex_unlock(&ctx->uring_lock);
2b188cc1
JA
10778 }
10779 if (flags & IORING_ENTER_GETEVENTS) {
773697b6 10780 if (ctx->syscall_iopoll) {
d487b43c
PB
10781 /*
10782 * We disallow the app entering submit/complete with
10783 * polling, but we still need to lock the ring to
10784 * prevent racing with polled issue that got punted to
10785 * a workqueue.
10786 */
10787 mutex_lock(&ctx->uring_lock);
10788iopoll_locked:
f81440d3 10789 ret = io_validate_ext_arg(flags, argp, argsz);
d487b43c
PB
10790 if (likely(!ret)) {
10791 min_complete = min(min_complete, ctx->cq_entries);
10792 ret = io_iopoll_check(ctx, min_complete);
10793 }
10794 mutex_unlock(&ctx->uring_lock);
def596e9 10795 } else {
f81440d3
PB
10796 const sigset_t __user *sig;
10797 struct __kernel_timespec __user *ts;
10798
10799 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10800 if (unlikely(ret))
10801 goto out;
d487b43c 10802 min_complete = min(min_complete, ctx->cq_entries);
c73ebb68 10803 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
def596e9 10804 }
2b188cc1
JA
10805 }
10806
7c504e65 10807out:
6805b32e 10808 percpu_ref_put(&ctx->refs);
2b188cc1 10809out_fput:
e7a6c00d
JA
10810 if (!(flags & IORING_ENTER_REGISTERED_RING))
10811 fdput(f);
2b188cc1
JA
10812 return submitted ? submitted : ret;
10813}
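
/*
 * Userspace sketch of the SQPOLL fast path handled in io_uring_enter() above
 * (a sketch only; the helper name is illustrative, 'sq_flags' is assumed to
 * point at the mapped SQ ring flags word, i.e. sq_ring + p->sq_off.flags, and
 * the caller is assumed to have issued a full memory barrier after publishing
 * its new SQ tail). With IORING_SETUP_SQPOLL the application normally stays
 * out of the kernel and only calls io_uring_enter() to wake an idle SQ thread.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sqpoll_submit(int ring_fd, unsigned to_submit, const __u32 *sq_flags)
{
	/* the kernel sets IORING_SQ_NEED_WAKEUP when the SQ thread went idle */
	if (!(*(volatile const __u32 *)sq_flags & IORING_SQ_NEED_WAKEUP))
		return to_submit;	/* SQ thread is running, nothing to do */

	return syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
		       IORING_ENTER_SQ_WAKEUP, NULL, 0);
}
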
10814
bebdb65e 10815#ifdef CONFIG_PROC_FS
c072481d 10816static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
61cf9370 10817 const struct cred *cred)
87ce955b 10818{
87ce955b
JA
10819 struct user_namespace *uns = seq_user_ns(m);
10820 struct group_info *gi;
10821 kernel_cap_t cap;
10822 unsigned __capi;
10823 int g;
10824
10825 seq_printf(m, "%5d\n", id);
10826 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10827 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10828 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10829 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10830 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10831 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10832 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10833 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10834 seq_puts(m, "\n\tGroups:\t");
10835 gi = cred->group_info;
10836 for (g = 0; g < gi->ngroups; g++) {
10837 seq_put_decimal_ull(m, g ? " " : "",
10838 from_kgid_munged(uns, gi->gid[g]));
10839 }
10840 seq_puts(m, "\n\tCapEff:\t");
10841 cap = cred->cap_effective;
10842 CAP_FOR_EACH_U32(__capi)
10843 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10844 seq_putc(m, '\n');
10845 return 0;
10846}
10847
c072481d
PB
10848static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10849 struct seq_file *m)
87ce955b 10850{
dbbe9c64 10851 struct io_sq_data *sq = NULL;
83f84356
HX
10852 struct io_overflow_cqe *ocqe;
10853 struct io_rings *r = ctx->rings;
10854 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
83f84356
HX
10855 unsigned int sq_head = READ_ONCE(r->sq.head);
10856 unsigned int sq_tail = READ_ONCE(r->sq.tail);
10857 unsigned int cq_head = READ_ONCE(r->cq.head);
10858 unsigned int cq_tail = READ_ONCE(r->cq.tail);
f75d1183 10859 unsigned int sq_entries, cq_entries;
fad8e0de 10860 bool has_lock;
83f84356
HX
10861 unsigned int i;
10862
10863 /*
10864 * We may get imprecise sqe and cqe info if the ring is actively running,
10865 * since we read cached_sq_head and cached_cq_tail without uring_lock,
10866 * and sq_tail and cq_head are changed by userspace. But that's OK, since
10867 * this info is usually only consulted when the ring is stuck.
10868 */
c0235652 10869 seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
f75d1183
JA
10870 seq_printf(m, "SqHead:\t%u\n", sq_head);
10871 seq_printf(m, "SqTail:\t%u\n", sq_tail);
10872 seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10873 seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10874 seq_printf(m, "CqHead:\t%u\n", cq_head);
10875 seq_printf(m, "CqTail:\t%u\n", cq_tail);
10876 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10877 seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10878 sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10879 for (i = 0; i < sq_entries; i++) {
10880 unsigned int entry = i + sq_head;
10881 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
a1957780 10882 struct io_uring_sqe *sqe;
f75d1183
JA
10883
10884 if (sq_idx > sq_mask)
10885 continue;
10886 sqe = &ctx->sq_sqes[sq_idx];
10887 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10888 sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10889 sqe->user_data);
83f84356 10890 }
f75d1183
JA
10891 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10892 cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10893 for (i = 0; i < cq_entries; i++) {
10894 unsigned int entry = i + cq_head;
10895 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
83f84356
HX
10896
10897 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
f75d1183
JA
10898 entry & cq_mask, cqe->user_data, cqe->res,
10899 cqe->flags);
83f84356 10900 }
87ce955b 10901
fad8e0de
JA
10902 /*
10903 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10904 * since the fdinfo case grabs it in the opposite direction of normal use
10905 * cases. If we fail to get the lock, we just don't iterate any
10906 * structures that could be going away outside the io_uring mutex.
10907 */
10908 has_lock = mutex_trylock(&ctx->uring_lock);
10909
5f3f26f9 10910 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 10911 sq = ctx->sq_data;
5f3f26f9
JA
10912 if (!sq->thread)
10913 sq = NULL;
10914 }
dbbe9c64
JQ
10915
10916 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10917 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 10918 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 10919 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7b29f92d 10920 struct file *f = io_file_from_index(ctx, i);
87ce955b 10921
87ce955b
JA
10922 if (f)
10923 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10924 else
10925 seq_printf(m, "%5u: <none>\n", i);
10926 }
10927 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 10928 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
41edf1a5 10929 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
4751f53d 10930 unsigned int len = buf->ubuf_end - buf->ubuf;
87ce955b 10931
4751f53d 10932 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
87ce955b 10933 }
61cf9370
MWO
10934 if (has_lock && !xa_empty(&ctx->personalities)) {
10935 unsigned long index;
10936 const struct cred *cred;
10937
87ce955b 10938 seq_printf(m, "Personalities:\n");
61cf9370
MWO
10939 xa_for_each(&ctx->personalities, index, cred)
10940 io_uring_show_cred(m, index, cred);
87ce955b 10941 }
83f84356
HX
10942 if (has_lock)
10943 mutex_unlock(&ctx->uring_lock);
10944
10945 seq_puts(m, "PollList:\n");
79ebeaee 10946 spin_lock(&ctx->completion_lock);
d7718a9d
JA
10947 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10948 struct hlist_head *list = &ctx->cancel_hash[i];
10949 struct io_kiocb *req;
10950
10951 hlist_for_each_entry(req, list, hash_node)
10952 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7f62d40d 10953 task_work_pending(req->task));
d7718a9d 10954 }
83f84356
HX
10955
10956 seq_puts(m, "CqOverflowList:\n");
10957 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10958 struct io_uring_cqe *cqe = &ocqe->cqe;
10959
10960 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
10961 cqe->user_data, cqe->res, cqe->flags);
10962
10963 }
10964
79ebeaee 10965 spin_unlock(&ctx->completion_lock);
87ce955b
JA
10966}
10967
c072481d 10968static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
87ce955b
JA
10969{
10970 struct io_ring_ctx *ctx = f->private_data;
10971
10972 if (percpu_ref_tryget(&ctx->refs)) {
10973 __io_uring_show_fdinfo(ctx, m);
10974 percpu_ref_put(&ctx->refs);
10975 }
10976}
bebdb65e 10977#endif
87ce955b 10978
2b188cc1
JA
10979static const struct file_operations io_uring_fops = {
10980 .release = io_uring_release,
10981 .mmap = io_uring_mmap,
6c5c240e
RP
10982#ifndef CONFIG_MMU
10983 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10984 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10985#endif
2b188cc1 10986 .poll = io_uring_poll,
bebdb65e 10987#ifdef CONFIG_PROC_FS
87ce955b 10988 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 10989#endif
2b188cc1
JA
10990};
10991
c072481d
PB
10992static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10993 struct io_uring_params *p)
2b188cc1 10994{
75b28aff
HV
10995 struct io_rings *rings;
10996 size_t size, sq_array_offset;
2b188cc1 10997
bd740481
JA
10998 /* make sure these are sane, as we already accounted them */
10999 ctx->sq_entries = p->sq_entries;
11000 ctx->cq_entries = p->cq_entries;
11001
75b28aff
HV
11002 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
11003 if (size == SIZE_MAX)
11004 return -EOVERFLOW;
11005
11006 rings = io_mem_alloc(size);
11007 if (!rings)
2b188cc1
JA
11008 return -ENOMEM;
11009
75b28aff
HV
11010 ctx->rings = rings;
11011 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
11012 rings->sq_ring_mask = p->sq_entries - 1;
11013 rings->cq_ring_mask = p->cq_entries - 1;
11014 rings->sq_ring_entries = p->sq_entries;
11015 rings->cq_ring_entries = p->cq_entries;
2b188cc1
JA
11016
11017 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
11018 if (size == SIZE_MAX) {
11019 io_mem_free(ctx->rings);
11020 ctx->rings = NULL;
2b188cc1 11021 return -EOVERFLOW;
eb065d30 11022 }
2b188cc1
JA
11023
11024 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
11025 if (!ctx->sq_sqes) {
11026 io_mem_free(ctx->rings);
11027 ctx->rings = NULL;
2b188cc1 11028 return -ENOMEM;
eb065d30 11029 }
2b188cc1 11030
2b188cc1
JA
11031 return 0;
11032}
11033
9faadcc8
PB
11034static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
11035{
11036 int ret, fd;
11037
11038 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
11039 if (fd < 0)
11040 return fd;
11041
eef51daa 11042 ret = io_uring_add_tctx_node(ctx);
9faadcc8
PB
11043 if (ret) {
11044 put_unused_fd(fd);
11045 return ret;
11046 }
11047 fd_install(fd, file);
11048 return fd;
11049}
11050
2b188cc1
JA
11051/*
11052 * Allocate an anonymous fd; this is what constitutes the application
11053 * visible backing of an io_uring instance. The application mmaps this
11054 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
11055 * we have to tie this fd to a socket for file garbage collection purposes.
11056 */
9faadcc8 11057static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
11058{
11059 struct file *file;
9faadcc8 11060#if defined(CONFIG_UNIX)
2b188cc1
JA
11061 int ret;
11062
2b188cc1
JA
11063 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
11064 &ctx->ring_sock);
11065 if (ret)
9faadcc8 11066 return ERR_PTR(ret);
2b188cc1
JA
11067#endif
11068
91a9ab7c
PM
11069 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
11070 O_RDWR | O_CLOEXEC, NULL);
2b188cc1 11071#if defined(CONFIG_UNIX)
9faadcc8
PB
11072 if (IS_ERR(file)) {
11073 sock_release(ctx->ring_sock);
11074 ctx->ring_sock = NULL;
11075 } else {
11076 ctx->ring_sock->file = file;
0f212204 11077 }
2b188cc1 11078#endif
9faadcc8 11079 return file;
2b188cc1
JA
11080}
11081
c072481d
PB
11082static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
11083 struct io_uring_params __user *params)
2b188cc1 11084{
2b188cc1 11085 struct io_ring_ctx *ctx;
9faadcc8 11086 struct file *file;
2b188cc1
JA
11087 int ret;
11088
8110c1a6 11089 if (!entries)
2b188cc1 11090 return -EINVAL;
8110c1a6
JA
11091 if (entries > IORING_MAX_ENTRIES) {
11092 if (!(p->flags & IORING_SETUP_CLAMP))
11093 return -EINVAL;
11094 entries = IORING_MAX_ENTRIES;
11095 }
2b188cc1
JA
11096
11097 /*
11098 * Use twice as many entries for the CQ ring. It's possible for the
11099 * application to drive a higher depth than the size of the SQ ring,
11100 * since the sqes are only used at submission time. This allows for
33a107f0
JA
11101 * some flexibility in overcommitting a bit. If the application has
11102 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
11103 * of CQ ring entries manually.
2b188cc1
JA
11104 */
11105 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
11106 if (p->flags & IORING_SETUP_CQSIZE) {
11107 /*
11108 * If IORING_SETUP_CQSIZE is set, we do the same roundup
11109 * to a power-of-two, if it isn't already. We do NOT impose
11110 * any cq vs sq ring sizing.
11111 */
eb2667b3 11112 if (!p->cq_entries)
33a107f0 11113 return -EINVAL;
8110c1a6
JA
11114 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
11115 if (!(p->flags & IORING_SETUP_CLAMP))
11116 return -EINVAL;
11117 p->cq_entries = IORING_MAX_CQ_ENTRIES;
11118 }
eb2667b3
JQ
11119 p->cq_entries = roundup_pow_of_two(p->cq_entries);
11120 if (p->cq_entries < p->sq_entries)
11121 return -EINVAL;
33a107f0
JA
11122 } else {
11123 p->cq_entries = 2 * p->sq_entries;
11124 }
2b188cc1 11125
2b188cc1 11126 ctx = io_ring_ctx_alloc(p);
62e398be 11127 if (!ctx)
2b188cc1 11128 return -ENOMEM;
773697b6
PB
11129
11130 /*
11131 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
11132 * space applications don't need to do io completion events
11133 * polling again, they can rely on io_sq_thread to do polling
11134 * work, which can reduce cpu usage and uring_lock contention.
11135 */
11136 if (ctx->flags & IORING_SETUP_IOPOLL &&
11137 !(ctx->flags & IORING_SETUP_SQPOLL))
11138 ctx->syscall_iopoll = 1;
11139
2b188cc1 11140 ctx->compat = in_compat_syscall();
62e398be
JA
11141 if (!capable(CAP_IPC_LOCK))
11142 ctx->user = get_uid(current_user());
2aede0e4
JA
11143
11144 /*
11145 * This is just grabbed for accounting purposes. When a process exits,
11146 * the mm is exited and dropped before the files, hence we need to hang
11147 * on to this mm purely for the purposes of being able to unaccount
11148 * memory (locked/pinned vm). It's not used for anything else.
11149 */
6b7898eb 11150 mmgrab(current->mm);
2aede0e4 11151 ctx->mm_account = current->mm;
6b7898eb 11152
2b188cc1
JA
11153 ret = io_allocate_scq_urings(ctx, p);
11154 if (ret)
11155 goto err;
11156
7e84e1c7 11157 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
11158 if (ret)
11159 goto err;
eae071c9 11160 /* always set a rsrc node */
47b228ce
PB
11161 ret = io_rsrc_node_switch_start(ctx);
11162 if (ret)
11163 goto err;
eae071c9 11164 io_rsrc_node_switch(ctx, NULL);
2b188cc1 11165
2b188cc1 11166 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
11167 p->sq_off.head = offsetof(struct io_rings, sq.head);
11168 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
11169 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
11170 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
11171 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
11172 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
11173 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
11174
11175 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
11176 p->cq_off.head = offsetof(struct io_rings, cq.head);
11177 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
11178 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
11179 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
11180 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
11181 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 11182 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 11183
7f13657d
XW
11184 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
11185 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 11186 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 11187 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9690557e 11188 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
c4212f3e
JA
11189 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
11190 IORING_FEAT_LINKED_FILE;
7f13657d
XW
11191
11192 if (copy_to_user(params, p, sizeof(*p))) {
11193 ret = -EFAULT;
11194 goto err;
11195 }
d1719f70 11196
9faadcc8
PB
11197 file = io_uring_get_file(ctx);
11198 if (IS_ERR(file)) {
11199 ret = PTR_ERR(file);
11200 goto err;
11201 }
11202
044c1ab3
JA
11203 /*
11204 * Install ring fd as the very last thing, so we don't risk someone
11205 * having closed it before we finish setup
11206 */
9faadcc8
PB
11207 ret = io_uring_install_fd(ctx, file);
11208 if (ret < 0) {
11209 /* fput will clean it up */
11210 fput(file);
11211 return ret;
11212 }
044c1ab3 11213
c826bd7a 11214 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
11215 return ret;
11216err:
11217 io_ring_ctx_wait_and_kill(ctx);
11218 return ret;
11219}
11220
11221/*
11222 * Sets up an io_uring context and returns the fd. The application asks for a
11223 * ring size; we return the actual sq/cq ring sizes (among other things) in the
11224 * params structure passed in.
11225 */
11226static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
11227{
11228 struct io_uring_params p;
2b188cc1
JA
11229 int i;
11230
11231 if (copy_from_user(&p, params, sizeof(p)))
11232 return -EFAULT;
11233 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
11234 if (p.resv[i])
11235 return -EINVAL;
11236 }
11237
6c271ce2 11238 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 11239 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7 11240 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
bcbb7bf6 11241 IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
2b188cc1
JA
11242 return -EINVAL;
11243
7f13657d 11244 return io_uring_create(entries, &p, params);
2b188cc1
JA
11245}
11246
11247SYSCALL_DEFINE2(io_uring_setup, u32, entries,
11248 struct io_uring_params __user *, params)
11249{
11250 return io_uring_setup(entries, params);
11251}
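
/*
 * Userspace sketch of the setup path above (a sketch only; the helper name is
 * illustrative, the IORING_SETUP_CQSIZE/CLAMP flags are just an example
 * choice, and it assumes <linux/io_uring.h> and __NR_io_uring_setup). On
 * return, p->sq_entries and p->cq_entries hold the rounded-up sizes and
 * p->features advertises things like IORING_FEAT_SINGLE_MMAP used when
 * mmap()ing the rings.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_ring(unsigned entries, struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	/* ask for a larger CQ ring than the default 2 * sq_entries */
	p->flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
	p->cq_entries = 4 * entries;

	/* returns the ring fd, or -1 with errno set */
	return syscall(__NR_io_uring_setup, entries, p);
}
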
11252
c072481d
PB
11253static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
11254 unsigned nr_args)
66f4af93
JA
11255{
11256 struct io_uring_probe *p;
11257 size_t size;
11258 int i, ret;
11259
11260 size = struct_size(p, ops, nr_args);
11261 if (size == SIZE_MAX)
11262 return -EOVERFLOW;
11263 p = kzalloc(size, GFP_KERNEL);
11264 if (!p)
11265 return -ENOMEM;
11266
11267 ret = -EFAULT;
11268 if (copy_from_user(p, arg, size))
11269 goto out;
11270 ret = -EINVAL;
11271 if (memchr_inv(p, 0, size))
11272 goto out;
11273
11274 p->last_op = IORING_OP_LAST - 1;
11275 if (nr_args > IORING_OP_LAST)
11276 nr_args = IORING_OP_LAST;
11277
11278 for (i = 0; i < nr_args; i++) {
11279 p->ops[i].op = i;
11280 if (!io_op_defs[i].not_supported)
11281 p->ops[i].flags = IO_URING_OP_SUPPORTED;
11282 }
11283 p->ops_len = i;
11284
11285 ret = 0;
11286 if (copy_to_user(arg, p, size))
11287 ret = -EFAULT;
11288out:
11289 kfree(p);
11290 return ret;
11291}
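
/*
 * Userspace sketch of IORING_REGISTER_PROBE as served by io_probe() above
 * (a sketch only; helper name is illustrative, assumes <linux/io_uring.h>):
 * query which opcodes this kernel supports and test one of them. The probe
 * buffer must be zeroed before the call.
 */
#include <linux/io_uring.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static int opcode_supported(int ring_fd, int op)
{
	size_t len = sizeof(struct io_uring_probe) +
		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *probe = calloc(1, len);
	int ret, supported = 0;

	if (!probe)
		return -1;
	ret = syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		      probe, IORING_OP_LAST);
	if (!ret && op < probe->ops_len)
		supported = !!(probe->ops[op].flags & IO_URING_OP_SUPPORTED);
	free(probe);
	return ret ? -1 : supported;
}
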
11292
071698e1
JA
11293static int io_register_personality(struct io_ring_ctx *ctx)
11294{
4379bf8b 11295 const struct cred *creds;
61cf9370 11296 u32 id;
1e6fa521 11297 int ret;
071698e1 11298
4379bf8b 11299 creds = get_current_cred();
1e6fa521 11300
61cf9370
MWO
11301 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
11302 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
a30f895a
JA
11303 if (ret < 0) {
11304 put_cred(creds);
11305 return ret;
11306 }
11307 return id;
071698e1
JA
11308}
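
/*
 * Userspace sketch of IORING_REGISTER_PERSONALITY as implemented above
 * (a sketch only; helper name is illustrative, assumes <linux/io_uring.h>):
 * snapshot the task's current credentials and get back an id.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_current_creds(int ring_fd)
{
	/* returns a personality id (>= 0), or -1 with errno set */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}

/*
 * An individual request is then issued under those credentials by stamping
 * the id into the sqe before submission, e.g. sqe->personality = id; the id
 * is dropped again with IORING_UNREGISTER_PERSONALITY, passing it as nr_args.
 */
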
11309
c072481d
PB
11310static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
11311 void __user *arg, unsigned int nr_args)
21b55dbc
SG
11312{
11313 struct io_uring_restriction *res;
11314 size_t size;
11315 int i, ret;
11316
7e84e1c7
SG
11317 /* Restrictions allowed only if rings started disabled */
11318 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11319 return -EBADFD;
11320
21b55dbc 11321 /* We allow only a single restrictions registration */
7e84e1c7 11322 if (ctx->restrictions.registered)
21b55dbc
SG
11323 return -EBUSY;
11324
11325 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
11326 return -EINVAL;
11327
11328 size = array_size(nr_args, sizeof(*res));
11329 if (size == SIZE_MAX)
11330 return -EOVERFLOW;
11331
11332 res = memdup_user(arg, size);
11333 if (IS_ERR(res))
11334 return PTR_ERR(res);
11335
11336 ret = 0;
11337
11338 for (i = 0; i < nr_args; i++) {
11339 switch (res[i].opcode) {
11340 case IORING_RESTRICTION_REGISTER_OP:
11341 if (res[i].register_op >= IORING_REGISTER_LAST) {
11342 ret = -EINVAL;
11343 goto out;
11344 }
11345
11346 __set_bit(res[i].register_op,
11347 ctx->restrictions.register_op);
11348 break;
11349 case IORING_RESTRICTION_SQE_OP:
11350 if (res[i].sqe_op >= IORING_OP_LAST) {
11351 ret = -EINVAL;
11352 goto out;
11353 }
11354
11355 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
11356 break;
11357 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
11358 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
11359 break;
11360 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
11361 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
11362 break;
11363 default:
11364 ret = -EINVAL;
11365 goto out;
11366 }
11367 }
11368
11369out:
11370 /* Reset all restrictions if an error happened */
11371 if (ret != 0)
11372 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
11373 else
7e84e1c7 11374 ctx->restrictions.registered = true;
21b55dbc
SG
11375
11376 kfree(res);
11377 return ret;
11378}
11379
7e84e1c7
SG
11380static int io_register_enable_rings(struct io_ring_ctx *ctx)
11381{
11382 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11383 return -EBADFD;
11384
11385 if (ctx->restrictions.registered)
11386 ctx->restricted = 1;
11387
0298ef96
PB
11388 ctx->flags &= ~IORING_SETUP_R_DISABLED;
11389 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
11390 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
11391 return 0;
11392}
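
/*
 * Userspace sketch of the restriction/enable flow implemented by the two
 * functions above (a sketch only; helper name is illustrative, error paths
 * are trimmed, assumes <linux/io_uring.h>): create the ring with
 * IORING_SETUP_R_DISABLED, whitelist sqe opcodes while it is still disabled,
 * then enable it with IORING_REGISTER_ENABLE_RINGS. Note that once enabled,
 * register opcodes not whitelisted via IORING_RESTRICTION_REGISTER_OP will
 * fail with -EACCES.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_restricted_ring(unsigned entries, struct io_uring_params *p)
{
	struct io_uring_restriction res[2];
	int fd;

	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_R_DISABLED;
	fd = syscall(__NR_io_uring_setup, entries, p);
	if (fd < 0)
		return fd;

	memset(res, 0, sizeof(res));
	res[0].opcode = IORING_RESTRICTION_SQE_OP;	/* allow readv ... */
	res[0].sqe_op = IORING_OP_READV;
	res[1].opcode = IORING_RESTRICTION_SQE_OP;	/* ... and writev only */
	res[1].sqe_op = IORING_OP_WRITEV;

	if (syscall(__NR_io_uring_register, fd, IORING_REGISTER_RESTRICTIONS,
		    res, 2) ||
	    syscall(__NR_io_uring_register, fd, IORING_REGISTER_ENABLE_RINGS,
		    NULL, 0)) {
		close(fd);
		return -1;
	}
	return fd;
}
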
11393
fdecb662 11394static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 11395 struct io_uring_rsrc_update2 *up,
98f0b3b4
PB
11396 unsigned nr_args)
11397{
11398 __u32 tmp;
11399 int err;
11400
11401 if (check_add_overflow(up->offset, nr_args, &tmp))
11402 return -EOVERFLOW;
11403 err = io_rsrc_node_switch_start(ctx);
11404 if (err)
11405 return err;
11406
fdecb662
PB
11407 switch (type) {
11408 case IORING_RSRC_FILE:
98f0b3b4 11409 return __io_sqe_files_update(ctx, up, nr_args);
634d00df
PB
11410 case IORING_RSRC_BUFFER:
11411 return __io_sqe_buffers_update(ctx, up, nr_args);
98f0b3b4
PB
11412 }
11413 return -EINVAL;
11414}
11415
c3bdad02
PB
11416static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
11417 unsigned nr_args)
98f0b3b4 11418{
c3bdad02 11419 struct io_uring_rsrc_update2 up;
98f0b3b4
PB
11420
11421 if (!nr_args)
11422 return -EINVAL;
c3bdad02
PB
11423 memset(&up, 0, sizeof(up));
11424 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
11425 return -EFAULT;
d8a3ba9c 11426 if (up.resv || up.resv2)
565c5e61 11427 return -EINVAL;
c3bdad02
PB
11428 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
11429}
11430
11431static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
992da01a 11432 unsigned size, unsigned type)
c3bdad02
PB
11433{
11434 struct io_uring_rsrc_update2 up;
11435
11436 if (size != sizeof(up))
11437 return -EINVAL;
98f0b3b4
PB
11438 if (copy_from_user(&up, arg, sizeof(up)))
11439 return -EFAULT;
d8a3ba9c 11440 if (!up.nr || up.resv || up.resv2)
98f0b3b4 11441 return -EINVAL;
992da01a 11442 return __io_register_rsrc_update(ctx, type, &up, up.nr);
98f0b3b4
PB
11443}
11444
c072481d 11445static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
992da01a 11446 unsigned int size, unsigned int type)
792e3582
PB
11447{
11448 struct io_uring_rsrc_register rr;
11449
11450 /* keep it extensible */
11451 if (size != sizeof(rr))
11452 return -EINVAL;
11453
11454 memset(&rr, 0, sizeof(rr));
11455 if (copy_from_user(&rr, arg, size))
11456 return -EFAULT;
992da01a 11457 if (!rr.nr || rr.resv || rr.resv2)
792e3582
PB
11458 return -EINVAL;
11459
992da01a 11460 switch (type) {
792e3582
PB
11461 case IORING_RSRC_FILE:
11462 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
11463 rr.nr, u64_to_user_ptr(rr.tags));
634d00df
PB
11464 case IORING_RSRC_BUFFER:
11465 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
11466 rr.nr, u64_to_user_ptr(rr.tags));
792e3582
PB
11467 }
11468 return -EINVAL;
11469}
11470
c072481d
PB
11471static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
11472 void __user *arg, unsigned len)
fe76421d
JA
11473{
11474 struct io_uring_task *tctx = current->io_uring;
11475 cpumask_var_t new_mask;
11476 int ret;
11477
11478 if (!tctx || !tctx->io_wq)
11479 return -EINVAL;
11480
11481 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
11482 return -ENOMEM;
11483
11484 cpumask_clear(new_mask);
11485 if (len > cpumask_size())
11486 len = cpumask_size();
11487
0f5e4b83
ES
11488 if (in_compat_syscall()) {
11489 ret = compat_get_bitmap(cpumask_bits(new_mask),
11490 (const compat_ulong_t __user *)arg,
11491 len * 8 /* CHAR_BIT */);
11492 } else {
11493 ret = copy_from_user(new_mask, arg, len);
11494 }
11495
11496 if (ret) {
fe76421d
JA
11497 free_cpumask_var(new_mask);
11498 return -EFAULT;
11499 }
11500
11501 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
11502 free_cpumask_var(new_mask);
11503 return ret;
11504}
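
/*
 * Userspace sketch of IORING_REGISTER_IOWQ_AFF as handled above (a sketch
 * only; helper name is illustrative, assumes <linux/io_uring.h> and a glibc
 * cpu_set_t): pin this ring's io-wq workers to one CPU. nr_args is the size
 * of the mask in bytes; the kernel clamps it to its own cpumask size.
 */
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pin_iowq_to_cpu(int ring_fd, int cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}
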
11505
c072481d 11506static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
fe76421d
JA
11507{
11508 struct io_uring_task *tctx = current->io_uring;
11509
11510 if (!tctx || !tctx->io_wq)
11511 return -EINVAL;
11512
11513 return io_wq_cpu_affinity(tctx->io_wq, NULL);
11514}
11515
c072481d
PB
11516static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
11517 void __user *arg)
b22fa62a 11518 __must_hold(&ctx->uring_lock)
2e480058 11519{
b22fa62a 11520 struct io_tctx_node *node;
fa84693b
JA
11521 struct io_uring_task *tctx = NULL;
11522 struct io_sq_data *sqd = NULL;
2e480058
JA
11523 __u32 new_count[2];
11524 int i, ret;
11525
2e480058
JA
11526 if (copy_from_user(new_count, arg, sizeof(new_count)))
11527 return -EFAULT;
11528 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11529 if (new_count[i] > INT_MAX)
11530 return -EINVAL;
11531
fa84693b
JA
11532 if (ctx->flags & IORING_SETUP_SQPOLL) {
11533 sqd = ctx->sq_data;
11534 if (sqd) {
009ad9f0
JA
11535 /*
11536 * Observe the correct sqd->lock -> ctx->uring_lock
11537 * ordering. Fine to drop uring_lock here, we hold
11538 * a ref to the ctx.
11539 */
41d3a6bd 11540 refcount_inc(&sqd->refs);
009ad9f0 11541 mutex_unlock(&ctx->uring_lock);
fa84693b 11542 mutex_lock(&sqd->lock);
009ad9f0 11543 mutex_lock(&ctx->uring_lock);
41d3a6bd
JA
11544 if (sqd->thread)
11545 tctx = sqd->thread->io_uring;
fa84693b
JA
11546 }
11547 } else {
11548 tctx = current->io_uring;
11549 }
11550
e139a1ec 11551 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
fa84693b 11552
bad119b9
PB
11553 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11554 if (new_count[i])
11555 ctx->iowq_limits[i] = new_count[i];
e139a1ec
PB
11556 ctx->iowq_limits_set = true;
11557
e139a1ec
PB
11558 if (tctx && tctx->io_wq) {
11559 ret = io_wq_max_workers(tctx->io_wq, new_count);
11560 if (ret)
11561 goto err;
11562 } else {
11563 memset(new_count, 0, sizeof(new_count));
11564 }
fa84693b 11565
41d3a6bd 11566 if (sqd) {
fa84693b 11567 mutex_unlock(&sqd->lock);
41d3a6bd
JA
11568 io_put_sq_data(sqd);
11569 }
2e480058
JA
11570
11571 if (copy_to_user(arg, new_count, sizeof(new_count)))
11572 return -EFAULT;
11573
b22fa62a
PB
11574 /* that's it for SQPOLL, only the SQPOLL task creates requests */
11575 if (sqd)
11576 return 0;
11577
11578 /* now propagate the restriction to all registered users */
11579 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
11580 struct io_uring_task *tctx = node->task->io_uring;
11581
11582 if (WARN_ON_ONCE(!tctx->io_wq))
11583 continue;
11584
11585 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11586 new_count[i] = ctx->iowq_limits[i];
11587 /* ignore errors, it always returns zero anyway */
11588 (void)io_wq_max_workers(tctx->io_wq, new_count);
11589 }
2e480058 11590 return 0;
fa84693b 11591err:
41d3a6bd 11592 if (sqd) {
fa84693b 11593 mutex_unlock(&sqd->lock);
41d3a6bd
JA
11594 io_put_sq_data(sqd);
11595 }
fa84693b 11596 return ret;
2e480058
JA
11597}
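
/*
 * Userspace sketch of IORING_REGISTER_IOWQ_MAX_WORKERS as implemented above
 * (a sketch only; helper name is illustrative, assumes <linux/io_uring.h>).
 * The argument is an array of two counts, index 0 for bounded and index 1
 * for unbounded io-wq workers; a zero leaves that limit untouched, and the
 * kernel writes the previously effective values back into the array.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int cap_iowq_workers(int ring_fd, unsigned bounded, unsigned unbounded)
{
	__u32 counts[2] = { bounded, unbounded };
	int ret;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
	/* on success, counts[] now holds the limits reported by the kernel */
	return ret;
}
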
11598
edafccee
JA
11599static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
11600 void __user *arg, unsigned nr_args)
b19062a5
JA
11601 __releases(ctx->uring_lock)
11602 __acquires(ctx->uring_lock)
edafccee
JA
11603{
11604 int ret;
11605
35fa71a0
JA
11606 /*
11607 * We're inside the ring mutex; if the ref is already dying, then
11608 * someone else killed the ctx or is already going through
11609 * io_uring_register().
11610 */
11611 if (percpu_ref_is_dying(&ctx->refs))
11612 return -ENXIO;
11613
75c4021a
PB
11614 if (ctx->restricted) {
11615 if (opcode >= IORING_REGISTER_LAST)
11616 return -EINVAL;
11617 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
11618 if (!test_bit(opcode, ctx->restrictions.register_op))
11619 return -EACCES;
11620 }
11621
edafccee
JA
11622 switch (opcode) {
11623 case IORING_REGISTER_BUFFERS:
634d00df 11624 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
edafccee
JA
11625 break;
11626 case IORING_UNREGISTER_BUFFERS:
11627 ret = -EINVAL;
11628 if (arg || nr_args)
11629 break;
0a96bbe4 11630 ret = io_sqe_buffers_unregister(ctx);
edafccee 11631 break;
6b06314c 11632 case IORING_REGISTER_FILES:
792e3582 11633 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
6b06314c
JA
11634 break;
11635 case IORING_UNREGISTER_FILES:
11636 ret = -EINVAL;
11637 if (arg || nr_args)
11638 break;
11639 ret = io_sqe_files_unregister(ctx);
11640 break;
c3a31e60 11641 case IORING_REGISTER_FILES_UPDATE:
c3bdad02 11642 ret = io_register_files_update(ctx, arg, nr_args);
c3a31e60 11643 break;
9b402849
JA
11644 case IORING_REGISTER_EVENTFD:
11645 ret = -EINVAL;
11646 if (nr_args != 1)
11647 break;
c75312dd
UA
11648 ret = io_eventfd_register(ctx, arg, 0);
11649 break;
11650 case IORING_REGISTER_EVENTFD_ASYNC:
11651 ret = -EINVAL;
11652 if (nr_args != 1)
f2842ab5 11653 break;
c75312dd 11654 ret = io_eventfd_register(ctx, arg, 1);
9b402849
JA
11655 break;
11656 case IORING_UNREGISTER_EVENTFD:
11657 ret = -EINVAL;
11658 if (arg || nr_args)
11659 break;
11660 ret = io_eventfd_unregister(ctx);
11661 break;
66f4af93
JA
11662 case IORING_REGISTER_PROBE:
11663 ret = -EINVAL;
11664 if (!arg || nr_args > 256)
11665 break;
11666 ret = io_probe(ctx, arg, nr_args);
11667 break;
071698e1
JA
11668 case IORING_REGISTER_PERSONALITY:
11669 ret = -EINVAL;
11670 if (arg || nr_args)
11671 break;
11672 ret = io_register_personality(ctx);
11673 break;
11674 case IORING_UNREGISTER_PERSONALITY:
11675 ret = -EINVAL;
11676 if (arg)
11677 break;
11678 ret = io_unregister_personality(ctx, nr_args);
11679 break;
7e84e1c7
SG
11680 case IORING_REGISTER_ENABLE_RINGS:
11681 ret = -EINVAL;
11682 if (arg || nr_args)
11683 break;
11684 ret = io_register_enable_rings(ctx);
11685 break;
21b55dbc
SG
11686 case IORING_REGISTER_RESTRICTIONS:
11687 ret = io_register_restrictions(ctx, arg, nr_args);
11688 break;
992da01a
PB
11689 case IORING_REGISTER_FILES2:
11690 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11691 break;
11692 case IORING_REGISTER_FILES_UPDATE2:
11693 ret = io_register_rsrc_update(ctx, arg, nr_args,
11694 IORING_RSRC_FILE);
11695 break;
11696 case IORING_REGISTER_BUFFERS2:
11697 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
792e3582 11698 break;
992da01a
PB
11699 case IORING_REGISTER_BUFFERS_UPDATE:
11700 ret = io_register_rsrc_update(ctx, arg, nr_args,
11701 IORING_RSRC_BUFFER);
c3bdad02 11702 break;
fe76421d
JA
11703 case IORING_REGISTER_IOWQ_AFF:
11704 ret = -EINVAL;
11705 if (!arg || !nr_args)
11706 break;
11707 ret = io_register_iowq_aff(ctx, arg, nr_args);
11708 break;
11709 case IORING_UNREGISTER_IOWQ_AFF:
11710 ret = -EINVAL;
11711 if (arg || nr_args)
11712 break;
11713 ret = io_unregister_iowq_aff(ctx);
11714 break;
2e480058
JA
11715 case IORING_REGISTER_IOWQ_MAX_WORKERS:
11716 ret = -EINVAL;
11717 if (!arg || nr_args != 2)
11718 break;
11719 ret = io_register_iowq_max_workers(ctx, arg);
11720 break;
e7a6c00d
JA
11721 case IORING_REGISTER_RING_FDS:
11722 ret = io_ringfd_register(ctx, arg, nr_args);
11723 break;
11724 case IORING_UNREGISTER_RING_FDS:
11725 ret = io_ringfd_unregister(ctx, arg, nr_args);
11726 break;
edafccee
JA
11727 default:
11728 ret = -EINVAL;
11729 break;
11730 }
11731
edafccee
JA
11732 return ret;
11733}
11734
11735SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11736 void __user *, arg, unsigned int, nr_args)
11737{
11738 struct io_ring_ctx *ctx;
11739 long ret = -EBADF;
11740 struct fd f;
11741
11742 f = fdget(fd);
11743 if (!f.file)
11744 return -EBADF;
11745
11746 ret = -EOPNOTSUPP;
11747 if (f.file->f_op != &io_uring_fops)
11748 goto out_fput;
11749
11750 ctx = f.file->private_data;
11751
b6c23dd5
PB
11752 io_run_task_work();
11753
edafccee
JA
11754 mutex_lock(&ctx->uring_lock);
11755 ret = __io_uring_register(ctx, opcode, arg, nr_args);
11756 mutex_unlock(&ctx->uring_lock);
2757be22 11757 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
edafccee
JA
11758out_fput:
11759 fdput(f);
11760 return ret;
11761}
11762
2b188cc1
JA
11763static int __init io_uring_init(void)
11764{
d7f62e82
SM
11765#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11766 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11767 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11768} while (0)
11769
11770#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11771 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11772 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11773 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
11774 BUILD_BUG_SQE_ELEM(1, __u8, flags);
11775 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
11776 BUILD_BUG_SQE_ELEM(4, __s32, fd);
11777 BUILD_BUG_SQE_ELEM(8, __u64, off);
11778 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
11779 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 11780 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
11781 BUILD_BUG_SQE_ELEM(24, __u32, len);
11782 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
11783 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
11784 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11785 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
11786 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
11787 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
11788 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
11789 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
11790 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
11791 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
11792 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
11793 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
11794 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
11795 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 11796 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
11797 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
11798 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
16340eab 11799 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
d7f62e82 11800 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 11801 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
b9445598 11802 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
d7f62e82 11803
b0d658ec
PB
11804 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11805 sizeof(struct io_uring_rsrc_update));
11806 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11807 sizeof(struct io_uring_rsrc_update2));
90499ad0
PB
11808
11809 /* ->buf_index is u16 */
11810 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11811
b0d658ec
PB
11812 /* should fit into one byte */
11813 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
68fe256a
PB
11814 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11815 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
b0d658ec 11816
d3656344 11817 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
32c2d33e 11818 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
16340eab 11819
91f245d5
JA
11820 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11821 SLAB_ACCOUNT);
2b188cc1
JA
11822 return 0;
11823};
11824__initcall(io_uring_init);