nvme: enable uring-passthrough for admin commands
[linux-2.6-block.git] / fs / io_uring.c
CommitLineData
2b188cc1
JA
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
1e84b97b
SB
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
d068b506 14 * through a control-dependency in io_get_cqe (smp_store_release to
1e84b97b
SB
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
2b188cc1
JA
29 *
30 * Also see the examples in the liburing library:
31 *
32 * git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes, but also to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
c992fe29 40 * Copyright (c) 2018-2019 Christoph Hellwig
2b188cc1
JA
41 */
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
52de1fe1 47#include <net/compat.h>
2b188cc1
JA
48#include <linux/refcount.h>
49#include <linux/uio.h>
6b47ee6e 50#include <linux/bits.h>
2b188cc1
JA
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
2b188cc1
JA
58#include <linux/percpu.h>
59#include <linux/slab.h>
edce22e1 60#include <linux/blk-mq.h>
edafccee 61#include <linux/bvec.h>
2b188cc1
JA
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
6b06314c 65#include <net/scm.h>
2b188cc1
JA
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
edafccee
JA
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
aa4c3967 72#include <linux/highmem.h>
15b71abe
JA
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
4840e418 75#include <linux/fadvise.h>
3e4827b0 76#include <linux/eventpoll.h>
7d67af2c 77#include <linux/splice.h>
b41e9852 78#include <linux/task_work.h>
bcf5a063 79#include <linux/pagemap.h>
0f212204 80#include <linux/io_uring.h>
5bd2182d 81#include <linux/audit.h>
cdc1404a 82#include <linux/security.h>
e9621e2b 83#include <linux/xattr.h>
2b188cc1 84
c826bd7a
DD
85#define CREATE_TRACE_POINTS
86#include <trace/events/io_uring.h>
87
2b188cc1
JA
88#include <uapi/linux/io_uring.h>
89
90#include "internal.h"
561fb04a 91#include "io-wq.h"
2b188cc1 92
5277deaa 93#define IORING_MAX_ENTRIES 32768
33a107f0 94#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
4ce8ad95 95#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
65e19f54 96
187f08c1 97/* only define max */
042b0d85 98#define IORING_MAX_FIXED_FILES (1U << 15)
21b55dbc
SG
99#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
100 IORING_REGISTER_LAST + IORING_OP_LAST)
2b188cc1 101
187f08c1 102#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
2d091d62
PB
103#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
104#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
105
489809e2
PB
106#define IORING_MAX_REG_BUFFERS (1U << 14)
107
68fe256a
PB
108#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
109 IOSQE_IO_HARDLINK | IOSQE_ASYNC)
110
5562a8d7
PB
111#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
112 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
68fe256a 113
c854357b 114#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
d5361233 115 REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA)
b16fed66 116
a538be5b
PB
117#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
118 IO_REQ_CLEAN_FLAGS)
119
09899b19
PB
120#define IO_TCTX_REFS_CACHE_NR (1U << 10)
121
2b188cc1
JA
122struct io_uring {
123 u32 head ____cacheline_aligned_in_smp;
124 u32 tail ____cacheline_aligned_in_smp;
125};
126
1e84b97b 127/*
75b28aff
HV
128 * This data is shared with the application through the mmap at offsets
129 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
1e84b97b
SB
130 *
131 * The offsets to the member fields are published through struct
132 * io_sqring_offsets when calling io_uring_setup.
133 */
75b28aff 134struct io_rings {
1e84b97b
SB
135 /*
136 * Head and tail offsets into the ring; the offsets need to be
137 * masked to get valid indices.
138 *
75b28aff
HV
139 * The kernel controls head of the sq ring and the tail of the cq ring,
140 * and the application controls tail of the sq ring and the head of the
141 * cq ring.
1e84b97b 142 */
75b28aff 143 struct io_uring sq, cq;
1e84b97b 144 /*
75b28aff 145 * Bitmasks to apply to head and tail offsets (constant, equals
1e84b97b
SB
146 * ring_entries - 1)
147 */
75b28aff
HV
148 u32 sq_ring_mask, cq_ring_mask;
149 /* Ring sizes (constant, power of 2) */
150 u32 sq_ring_entries, cq_ring_entries;
1e84b97b
SB
151 /*
152 * Number of invalid entries dropped by the kernel due to
153 * invalid index stored in array
154 *
155 * Written by the kernel, shouldn't be modified by the
156 * application (i.e. get number of "new events" by comparing to
157 * cached value).
158 *
159 * After a new SQ head value was read by the application this
160 * counter includes all submissions that were dropped reaching
161 * the new SQ head (and possibly more).
162 */
75b28aff 163 u32 sq_dropped;
1e84b97b 164 /*
0d9b5b3a 165 * Runtime SQ flags
1e84b97b
SB
166 *
167 * Written by the kernel, shouldn't be modified by the
168 * application.
169 *
170 * The application needs a full memory barrier before checking
171 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
172 */
3a4b89a2 173 atomic_t sq_flags;
0d9b5b3a
SG
174 /*
175 * Runtime CQ flags
176 *
177 * Written by the application, shouldn't be modified by the
178 * kernel.
179 */
fe7e3257 180 u32 cq_flags;
1e84b97b
SB
181 /*
182 * Number of completion events lost because the queue was full;
183 * this should be avoided by the application by making sure
0b4295b5 184 * there are not more requests pending than there is space in
1e84b97b
SB
185 * the completion queue.
186 *
187 * Written by the kernel, shouldn't be modified by the
188 * application (i.e. get number of "new events" by comparing to
189 * cached value).
190 *
191 * As completion events come in out of order this counter is not
192 * ordered with any other data.
193 */
75b28aff 194 u32 cq_overflow;
1e84b97b
SB
195 /*
196 * Ring buffer of completion events.
197 *
198 * The kernel writes completion events fresh every time they are
199 * produced, so the application is allowed to modify pending
200 * entries.
201 */
75b28aff 202 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
2b188cc1
JA
203};
204
edafccee
JA
205struct io_mapped_ubuf {
206 u64 ubuf;
4751f53d 207 u64 ubuf_end;
edafccee 208 unsigned int nr_bvecs;
de293938 209 unsigned long acct_pages;
41edf1a5 210 struct bio_vec bvec[];
edafccee
JA
211};
212
50238531
BM
213struct io_ring_ctx;
214
6c2450ae 215struct io_overflow_cqe {
6c2450ae 216 struct list_head list;
e45a3e05 217 struct io_uring_cqe cqe;
6c2450ae
PB
218};
219
5e45690a
JA
220/*
221 * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
222 * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
223 * can't safely always dereference the file when the task has exited and ring
224 * cleanup is done. If a file is tracked and part of SCM, then unix gc on
225 * process exit may reap it before __io_sqe_files_unregister() is run.
226 */
227#define FFS_NOWAIT 0x1UL
228#define FFS_ISREG 0x2UL
229#if defined(CONFIG_64BIT)
230#define FFS_SCM 0x4UL
231#else
232#define IO_URING_SCM_ALL
233#define FFS_SCM 0x0UL
234#endif
235#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
236
a04b0ac0
PB
237struct io_fixed_file {
238 /* file * with additional FFS_* flags */
239 unsigned long file_ptr;
240};
241
269bbe5f
BM
242struct io_rsrc_put {
243 struct list_head list;
b60c8dce 244 u64 tag;
50238531
BM
245 union {
246 void *rsrc;
247 struct file *file;
bd54b6fe 248 struct io_mapped_ubuf *buf;
50238531 249 };
269bbe5f
BM
250};
251
aeca241b 252struct io_file_table {
042b0d85 253 struct io_fixed_file *files;
31b51510
JA
254};
255
b895c9a6 256struct io_rsrc_node {
05589553
XW
257 struct percpu_ref refs;
258 struct list_head node;
269bbe5f 259 struct list_head rsrc_list;
b895c9a6 260 struct io_rsrc_data *rsrc_data;
4a38aed2 261 struct llist_node llist;
e297822b 262 bool done;
05589553
XW
263};
264
40ae0ff7
PB
265typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
266
b895c9a6 267struct io_rsrc_data {
05f3fb3c
JA
268 struct io_ring_ctx *ctx;
269
2d091d62
PB
270 u64 **tags;
271 unsigned int nr;
40ae0ff7 272 rsrc_put_fn *do_put;
3e942498 273 atomic_t refs;
05f3fb3c 274 struct completion done;
8bad28d8 275 bool quiesce;
05f3fb3c
JA
276};
277
dbc7d452 278struct io_buffer_list {
dbc7d452
JA
279 struct list_head buf_list;
280 __u16 bgid;
281};
282
5a2e745d
JA
283struct io_buffer {
284 struct list_head list;
285 __u64 addr;
d1f82808 286 __u32 len;
5a2e745d 287 __u16 bid;
b1c62645 288 __u16 bgid;
5a2e745d
JA
289};
290
21b55dbc
SG
291struct io_restriction {
292 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
293 DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
294 u8 sqe_flags_allowed;
295 u8 sqe_flags_required;
7e84e1c7 296 bool registered;
21b55dbc
SG
297};
298
37d1e2e3
JA
299enum {
300 IO_SQ_THREAD_SHOULD_STOP = 0,
301 IO_SQ_THREAD_SHOULD_PARK,
302};
303
534ca6d6
JA
304struct io_sq_data {
305 refcount_t refs;
9e138a48 306 atomic_t park_pending;
09a6f4ef 307 struct mutex lock;
69fb2131
JA
308
309 /* ctx's that are using this sqd */
310 struct list_head ctx_list;
69fb2131 311
534ca6d6
JA
312 struct task_struct *thread;
313 struct wait_queue_head wait;
08369246
XW
314
315 unsigned sq_thread_idle;
37d1e2e3
JA
316 int sq_cpu;
317 pid_t task_pid;
5c2469e0 318 pid_t task_tgid;
37d1e2e3
JA
319
320 unsigned long state;
37d1e2e3 321 struct completion exited;
534ca6d6
JA
322};
323
6dd0be1e 324#define IO_COMPL_BATCH 32
6ff119a6 325#define IO_REQ_CACHE_SIZE 32
bf019da7 326#define IO_REQ_ALLOC_BATCH 8
258b29a9 327
a1ab7b35
PB
328struct io_submit_link {
329 struct io_kiocb *head;
330 struct io_kiocb *last;
331};
332
258b29a9 333struct io_submit_state {
5a158c6b
PB
334 /* inline/task_work completion list, under ->uring_lock */
335 struct io_wq_work_node free_list;
336 /* batch completion logic */
337 struct io_wq_work_list compl_reqs;
a1ab7b35 338 struct io_submit_link link;
258b29a9 339
258b29a9 340 bool plug_started;
4b628aeb 341 bool need_plug;
3d4aeb9f 342 bool flush_cqes;
5ca7a8b3 343 unsigned short submit_nr;
5a158c6b 344 struct blk_plug plug;
258b29a9
PB
345};
346
77bc59b4
UA
347struct io_ev_fd {
348 struct eventfd_ctx *cq_ev_fd;
c75312dd 349 unsigned int eventfd_async: 1;
77bc59b4
UA
350 struct rcu_head rcu;
351};
352
9cfc7e94 353#define BGID_ARRAY 64
dbc7d452 354
2b188cc1 355struct io_ring_ctx {
b52ecf8c 356 /* const or read-mostly hot data */
2b188cc1
JA
357 struct {
358 struct percpu_ref refs;
2b188cc1 359
b52ecf8c 360 struct io_rings *rings;
2b188cc1 361 unsigned int flags;
9f010507 362 enum task_work_notify_mode notify_method;
e1d85334 363 unsigned int compat: 1;
e1d85334 364 unsigned int drain_next: 1;
21b55dbc 365 unsigned int restricted: 1;
f18ee4cf 366 unsigned int off_timeout_used: 1;
10c66904 367 unsigned int drain_active: 1;
5562a8d7 368 unsigned int drain_disabled: 1;
9aa8dfde 369 unsigned int has_evfd: 1;
773697b6 370 unsigned int syscall_iopoll: 1;
b52ecf8c 371 } ____cacheline_aligned_in_smp;
2b188cc1 372
7f1129d2 373 /* submission data */
b52ecf8c 374 struct {
0499e582
PB
375 struct mutex uring_lock;
376
75b28aff
HV
377 /*
378 * Ring buffer of indices into array of io_uring_sqe, which is
379 * mmapped by the application using the IORING_OFF_SQES offset.
380 *
381 * This indirection could e.g. be used to assign fixed
382 * io_uring_sqe entries to operations and only submit them to
383 * the queue when needed.
384 *
385 * The kernel modifies neither the indices array nor the entries
386 * array.
387 */
388 u32 *sq_array;
c7af47cf 389 struct io_uring_sqe *sq_sqes;
2b188cc1
JA
390 unsigned cached_sq_head;
391 unsigned sq_entries;
de0617e4 392 struct list_head defer_list;
7f1129d2
PB
393
394 /*
395 * Fixed resources fast path, should be accessed only under
396 * uring_lock, and updated through io_uring_register(2)
397 */
398 struct io_rsrc_node *rsrc_node;
ab409402 399 int rsrc_cached_refs;
8e29da69 400 atomic_t cancel_seq;
7f1129d2
PB
401 struct io_file_table file_table;
402 unsigned nr_user_files;
403 unsigned nr_user_bufs;
404 struct io_mapped_ubuf **user_bufs;
405
406 struct io_submit_state submit_state;
660cbfa2 407
9cfc7e94
JA
408 struct io_buffer_list *io_bl;
409 struct xarray io_bl_xa;
cc3cec83 410 struct list_head io_buffers_cache;
660cbfa2 411
5262f567 412 struct list_head timeout_list;
ef9dd637 413 struct list_head ltimeout_list;
1d7bb1d5 414 struct list_head cq_overflow_list;
4d9237e3 415 struct list_head apoll_cache;
7f1129d2
PB
416 struct xarray personalities;
417 u32 pers_next;
418 unsigned sq_thread_idle;
2b188cc1
JA
419 } ____cacheline_aligned_in_smp;
420
d0acdee2 421 /* IRQ completion list, under ->completion_lock */
c2b6c6bc 422 struct io_wq_work_list locked_free_list;
d0acdee2 423 unsigned int locked_free_nr;
3c1a2ead 424
7c30f36a 425 const struct cred *sq_creds; /* cred used for __io_sq_thread() */
534ca6d6
JA
426 struct io_sq_data *sq_data; /* if using sq thread polling */
427
90554200 428 struct wait_queue_head sqo_sq_wait;
69fb2131 429 struct list_head sqd_list;
75b28aff 430
10988a0a 431 unsigned long check_cq;
5ed7a37d 432
206aefde 433 struct {
d8da428b
PB
434 /*
435 * We cache a range of free CQEs we can use, once exhausted it
436 * should go through a slower range setup, see __io_get_cqe()
437 */
438 struct io_uring_cqe *cqe_cached;
439 struct io_uring_cqe *cqe_sentinel;
440
206aefde
JA
441 unsigned cached_cq_tail;
442 unsigned cq_entries;
77bc59b4 443 struct io_ev_fd __rcu *io_ev_fd;
0499e582
PB
444 struct wait_queue_head cq_wait;
445 unsigned cq_extra;
446 atomic_t cq_timeouts;
0499e582 447 unsigned cq_last_tm_flush;
206aefde 448 } ____cacheline_aligned_in_smp;
2b188cc1 449
2b188cc1
JA
450 struct {
451 spinlock_t completion_lock;
e94f141b 452
89850fce
JA
453 spinlock_t timeout_lock;
454
def596e9 455 /*
540e32a0 456 * ->iopoll_list is protected by the ctx->uring_lock for
def596e9
JA
457 * io_uring instances that don't use IORING_SETUP_SQPOLL.
458 * For SQPOLL, only the single threaded io_sq_thread() will
459 * manipulate the list, hence no extra locking is needed there.
460 */
5eef4e87 461 struct io_wq_work_list iopoll_list;
78076bb6
JA
462 struct hlist_head *cancel_hash;
463 unsigned cancel_hash_bits;
915b3dde 464 bool poll_multi_queue;
cc3cec83
JA
465
466 struct list_head io_buffers_comp;
2b188cc1 467 } ____cacheline_aligned_in_smp;
85faa7b8 468
21b55dbc 469 struct io_restriction restrictions;
3c1a2ead 470
b13a8918
PB
471 /* slow path rsrc auxilary data, used by update/register */
472 struct {
473 struct io_rsrc_node *rsrc_backup_node;
474 struct io_mapped_ubuf *dummy_ubuf;
475 struct io_rsrc_data *file_data;
476 struct io_rsrc_data *buf_data;
477
478 struct delayed_work rsrc_put_work;
479 struct llist_head rsrc_put_llist;
480 struct list_head rsrc_ref_list;
481 spinlock_t rsrc_ref_lock;
cc3cec83
JA
482
483 struct list_head io_buffers_pages;
b13a8918
PB
484 };
485
3c1a2ead 486 /* Keep this last, we don't need it for the fast path */
b986af7e
PB
487 struct {
488 #if defined(CONFIG_UNIX)
489 struct socket *ring_sock;
490 #endif
491 /* hashed buffered write serialization */
492 struct io_wq_hash *hash_map;
493
494 /* Only used for accounting purposes */
495 struct user_struct *user;
496 struct mm_struct *mm_account;
497
498 /* ctx exit and cancelation */
9011bf9a
PB
499 struct llist_head fallback_llist;
500 struct delayed_work fallback_work;
b986af7e
PB
501 struct work_struct exit_work;
502 struct list_head tctx_list;
503 struct completion ref_comp;
e139a1ec
PB
504 u32 iowq_limits[2];
505 bool iowq_limits_set;
b986af7e 506 };
2b188cc1
JA
507};
508
e7a6c00d
JA
509/*
510 * Arbitrary limit, can be raised if need be
511 */
512#define IO_RINGFD_REG_MAX 16
513
53e043b2
SM
514struct io_uring_task {
515 /* submission side */
09899b19 516 int cached_refs;
53e043b2
SM
517 struct xarray xa;
518 struct wait_queue_head wait;
ee53fb2b
SM
519 const struct io_ring_ctx *last;
520 struct io_wq *io_wq;
53e043b2
SM
521 struct percpu_counter inflight;
522 atomic_t in_idle;
53e043b2
SM
523
524 spinlock_t task_lock;
525 struct io_wq_work_list task_list;
4813c377 526 struct io_wq_work_list prior_task_list;
53e043b2 527 struct callback_head task_work;
e7a6c00d 528 struct file **registered_rings;
6294f368 529 bool task_running;
53e043b2
SM
530};
531
09bb8394
JA
532/*
533 * First field must be the file pointer in all the
534 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
535 */
221c5eb2
JA
536struct io_poll_iocb {
537 struct file *file;
018043be 538 struct wait_queue_head *head;
221c5eb2 539 __poll_t events;
392edb45 540 struct wait_queue_entry wait;
221c5eb2
JA
541};
542
9d805892 543struct io_poll_update {
018043be 544 struct file *file;
9d805892
PB
545 u64 old_user_data;
546 u64 new_user_data;
547 __poll_t events;
b69de288
JA
548 bool update_events;
549 bool update_user_data;
018043be
PB
550};
551
b5dba59e
JA
552struct io_close {
553 struct file *file;
b5dba59e 554 int fd;
7df778be 555 u32 file_slot;
b5dba59e
JA
556};
557
ad8a48ac
JA
558struct io_timeout_data {
559 struct io_kiocb *req;
560 struct hrtimer timer;
561 struct timespec64 ts;
562 enum hrtimer_mode mode;
50c1df2b 563 u32 flags;
ad8a48ac
JA
564};
565
8ed8d3c3
JA
566struct io_accept {
567 struct file *file;
568 struct sockaddr __user *addr;
569 int __user *addr_len;
570 int flags;
aaa4db12 571 u32 file_slot;
09952e3e 572 unsigned long nofile;
8ed8d3c3
JA
573};
574
1374e08e
JA
575struct io_socket {
576 struct file *file;
577 int domain;
578 int type;
579 int protocol;
580 int flags;
581 u32 file_slot;
582 unsigned long nofile;
583};
584
8ed8d3c3
JA
585struct io_sync {
586 struct file *file;
587 loff_t len;
588 loff_t off;
589 int flags;
d63d1b5e 590 int mode;
8ed8d3c3
JA
591};
592
fbf23849
JA
593struct io_cancel {
594 struct file *file;
595 u64 addr;
8e29da69 596 u32 flags;
4bf94615 597 s32 fd;
fbf23849
JA
598};
599
b29472ee
JA
600struct io_timeout {
601 struct file *file;
bfe68a22
PB
602 u32 off;
603 u32 target_seq;
135fcde8 604 struct list_head list;
90cd7e42
PB
605 /* head of the link, used by linked timeouts only */
606 struct io_kiocb *head;
89b263f6
JA
607 /* for linked completions */
608 struct io_kiocb *prev;
b29472ee
JA
609};
610
0bdf7a2d
PB
611struct io_timeout_rem {
612 struct file *file;
613 u64 addr;
9c8e11b3
PB
614
615 /* timeout update */
616 struct timespec64 ts;
617 u32 flags;
f1042b6c 618 bool ltimeout;
0bdf7a2d
PB
619};
620
9adbd45d
JA
621struct io_rw {
622 /* NOTE: kiocb has the file as the first member, so don't do it here */
623 struct kiocb kiocb;
624 u64 addr;
584b0180
JA
625 u32 len;
626 u32 flags;
9adbd45d
JA
627};
628
3fbb51c1
JA
629struct io_connect {
630 struct file *file;
631 struct sockaddr __user *addr;
632 int addr_len;
633};
634
e47293fd
JA
635struct io_sr_msg {
636 struct file *file;
fddaface 637 union {
4af3417a
PB
638 struct compat_msghdr __user *umsg_compat;
639 struct user_msghdr __user *umsg;
640 void __user *buf;
fddaface 641 };
e47293fd 642 int msg_flags;
fddaface 643 size_t len;
7ba89d2a 644 size_t done_io;
0455d4cc 645 unsigned int flags;
e47293fd
JA
646};
647
15b71abe
JA
648struct io_open {
649 struct file *file;
650 int dfd;
b9445598 651 u32 file_slot;
15b71abe 652 struct filename *filename;
c12cedf2 653 struct open_how how;
4022e7af 654 unsigned long nofile;
15b71abe
JA
655};
656
269bbe5f 657struct io_rsrc_update {
05f3fb3c
JA
658 struct file *file;
659 u64 arg;
660 u32 nr_args;
661 u32 offset;
662};
663
4840e418
JA
664struct io_fadvise {
665 struct file *file;
666 u64 offset;
667 u32 len;
668 u32 advice;
669};
670
c1ca757b
JA
671struct io_madvise {
672 struct file *file;
673 u64 addr;
674 u32 len;
675 u32 advice;
676};
677
3e4827b0
JA
678struct io_epoll {
679 struct file *file;
680 int epfd;
681 int op;
682 int fd;
683 struct epoll_event event;
e47293fd
JA
684};
685
7d67af2c
PB
686struct io_splice {
687 struct file *file_out;
7d67af2c
PB
688 loff_t off_out;
689 loff_t off_in;
690 u64 len;
a3e4bc23 691 int splice_fd_in;
7d67af2c
PB
692 unsigned int flags;
693};
694
ddf0322d
JA
695struct io_provide_buf {
696 struct file *file;
697 __u64 addr;
38134ada 698 __u32 len;
ddf0322d
JA
699 __u32 bgid;
700 __u16 nbufs;
701 __u16 bid;
702};
703
1d9e1288
BM
704struct io_statx {
705 struct file *file;
706 int dfd;
707 unsigned int mask;
708 unsigned int flags;
1b6fe6e0 709 struct filename *filename;
1d9e1288
BM
710 struct statx __user *buffer;
711};
712
36f4fa68
JA
713struct io_shutdown {
714 struct file *file;
715 int how;
716};
717
80a261fd
JA
718struct io_rename {
719 struct file *file;
720 int old_dfd;
721 int new_dfd;
722 struct filename *oldpath;
723 struct filename *newpath;
724 int flags;
725};
726
14a1143b
JA
727struct io_unlink {
728 struct file *file;
729 int dfd;
730 int flags;
731 struct filename *filename;
732};
733
e34a02dc
DK
734struct io_mkdir {
735 struct file *file;
736 int dfd;
737 umode_t mode;
738 struct filename *filename;
739};
740
7a8721f8
DK
741struct io_symlink {
742 struct file *file;
743 int new_dfd;
744 struct filename *oldpath;
745 struct filename *newpath;
746};
747
cf30da90
DK
748struct io_hardlink {
749 struct file *file;
750 int old_dfd;
751 int new_dfd;
752 struct filename *oldpath;
753 struct filename *newpath;
754 int flags;
755};
756
4f57f06c
JA
757struct io_msg {
758 struct file *file;
759 u64 user_data;
760 u32 len;
761};
762
2bb04df7
SR
763struct io_nop {
764 struct file *file;
765 u64 extra1;
766 u64 extra2;
767};
768
f499a021
JA
769struct io_async_connect {
770 struct sockaddr_storage address;
771};
772
03b1230c
JA
773struct io_async_msghdr {
774 struct iovec fast_iov[UIO_FASTIOV];
257e84a5
PB
775 /* points to an allocated iov, if NULL we use fast_iov instead */
776 struct iovec *free_iov;
03b1230c
JA
777 struct sockaddr __user *uaddr;
778 struct msghdr msg;
b537916c 779 struct sockaddr_storage addr;
03b1230c
JA
780};
781
538941e2 782struct io_rw_state {
ff6165b2 783 struct iov_iter iter;
cd658695 784 struct iov_iter_state iter_state;
c88598a9 785 struct iovec fast_iov[UIO_FASTIOV];
538941e2
PB
786};
787
788struct io_async_rw {
789 struct io_rw_state s;
790 const struct iovec *free_iovec;
227c0c96 791 size_t bytes_done;
bcf5a063 792 struct wait_page_queue wpq;
f67676d1
JA
793};
794
e9621e2b
SR
795struct io_xattr {
796 struct file *file;
797 struct xattr_ctx ctx;
798 struct filename *filename;
799};
800
6b47ee6e
PB
801enum {
802 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
803 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
804 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
805 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
806 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
bcda7baa 807 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
04c76b41 808 REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
6b47ee6e 809
dddca226 810 /* first byte is taken by user flags, shift it to not overlap */
93d2bcd2 811 REQ_F_FAIL_BIT = 8,
6b47ee6e
PB
812 REQ_F_INFLIGHT_BIT,
813 REQ_F_CUR_POS_BIT,
814 REQ_F_NOWAIT_BIT,
6b47ee6e 815 REQ_F_LINK_TIMEOUT_BIT,
99bc4c38 816 REQ_F_NEED_CLEANUP_BIT,
d7718a9d 817 REQ_F_POLLED_BIT,
bcda7baa 818 REQ_F_BUFFER_SELECTED_BIT,
e342c807 819 REQ_F_COMPLETE_INLINE_BIT,
230d50d4 820 REQ_F_REISSUE_BIT,
b8e64b53 821 REQ_F_CREDS_BIT,
20e60a38 822 REQ_F_REFCOUNT_BIT,
4d13d1a4 823 REQ_F_ARM_LTIMEOUT_BIT,
d886e185 824 REQ_F_ASYNC_DATA_BIT,
04c76b41 825 REQ_F_SKIP_LINK_CQES_BIT,
91eac1c6
JA
826 REQ_F_SINGLE_POLL_BIT,
827 REQ_F_DOUBLE_POLL_BIT,
8a3e8ee5 828 REQ_F_PARTIAL_IO_BIT,
7b29f92d 829 /* keep async read/write and isreg together and in order */
35645ac3 830 REQ_F_SUPPORT_NOWAIT_BIT,
7b29f92d 831 REQ_F_ISREG_BIT,
84557871
JA
832
833 /* not a real bit, just to check we're not overflowing the space */
834 __REQ_F_LAST_BIT,
6b47ee6e
PB
835};
836
837enum {
838 /* ctx owns file */
839 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
840 /* drain existing IO first */
841 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
842 /* linked sqes */
843 REQ_F_LINK = BIT(REQ_F_LINK_BIT),
844 /* doesn't sever on completion < 0 */
845 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
846 /* IOSQE_ASYNC */
847 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
bcda7baa
JA
848 /* IOSQE_BUFFER_SELECT */
849 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
04c76b41
PB
850 /* IOSQE_CQE_SKIP_SUCCESS */
851 REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
6b47ee6e 852
6b47ee6e 853 /* fail rest of links */
93d2bcd2 854 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
b05a1bcd 855 /* on inflight list, should be cancelled and waited on exit reliably */
6b47ee6e
PB
856 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
857 /* read/write uses file position */
858 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
859 /* must not punt to workers */
860 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
900fad45 861 /* has or had linked timeout */
6b47ee6e 862 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
99bc4c38
PB
863 /* needs cleanup */
864 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
d7718a9d
JA
865 /* already went through poll handler */
866 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
bcda7baa
JA
867 /* buffer already selected */
868 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
e342c807
PB
869 /* completion is deferred through io_comp_state */
870 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
230d50d4
JA
871 /* caller should reissue async */
872 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
35645ac3
PB
873 /* supports async reads/writes */
874 REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
7b29f92d
JA
875 /* regular file */
876 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
b8e64b53
PB
877 /* has creds assigned */
878 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
20e60a38
PB
879 /* skip refcounting if not set */
880 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
4d13d1a4
PB
881 /* there is a linked timeout that has to be armed */
882 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
d886e185
PB
883 /* ->async_data allocated */
884 REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
04c76b41
PB
885 /* don't post CQEs while failing linked requests */
886 REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
91eac1c6
JA
887 /* single poll may be active */
888 REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
889 /* double poll may active */
890 REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
8a3e8ee5
JA
891 /* request has already done partial IO */
892 REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
d7718a9d
JA
893};
894
895struct async_poll {
896 struct io_poll_iocb poll;
807abcb0 897 struct io_poll_iocb *double_poll;
6b47ee6e
PB
898};
899
f237c30a 900typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
5b0a6acc 901
7cbf1722 902struct io_task_work {
5b0a6acc
PB
903 union {
904 struct io_wq_work_node node;
905 struct llist_node fallback_node;
906 };
907 io_req_tw_func_t func;
7cbf1722
JA
908};
909
992da01a
PB
910enum {
911 IORING_RSRC_FILE = 0,
912 IORING_RSRC_BUFFER = 1,
913};
914
cef216fc
PB
915struct io_cqe {
916 __u64 user_data;
917 __s32 res;
918 /* fd initially, then cflags for completion */
919 union {
920 __u32 flags;
921 int fd;
922 };
923};
924
10988a0a
DY
925enum {
926 IO_CHECK_CQ_OVERFLOW_BIT,
155bc950 927 IO_CHECK_CQ_DROPPED_BIT,
10988a0a
DY
928};
929
09bb8394
JA
930/*
931 * NOTE! Each of the iocb union members has the file pointer
932 * as the first entry in their struct definition. So you can
933 * access the file pointer through any of the sub-structs,
63c36549 934 * or directly as just 'file' in this struct.
09bb8394 935 */
2b188cc1 936struct io_kiocb {
221c5eb2 937 union {
09bb8394 938 struct file *file;
9adbd45d 939 struct io_rw rw;
221c5eb2 940 struct io_poll_iocb poll;
9d805892 941 struct io_poll_update poll_update;
8ed8d3c3
JA
942 struct io_accept accept;
943 struct io_sync sync;
fbf23849 944 struct io_cancel cancel;
b29472ee 945 struct io_timeout timeout;
0bdf7a2d 946 struct io_timeout_rem timeout_rem;
3fbb51c1 947 struct io_connect connect;
e47293fd 948 struct io_sr_msg sr_msg;
15b71abe 949 struct io_open open;
b5dba59e 950 struct io_close close;
269bbe5f 951 struct io_rsrc_update rsrc_update;
4840e418 952 struct io_fadvise fadvise;
c1ca757b 953 struct io_madvise madvise;
3e4827b0 954 struct io_epoll epoll;
7d67af2c 955 struct io_splice splice;
ddf0322d 956 struct io_provide_buf pbuf;
1d9e1288 957 struct io_statx statx;
36f4fa68 958 struct io_shutdown shutdown;
80a261fd 959 struct io_rename rename;
14a1143b 960 struct io_unlink unlink;
e34a02dc 961 struct io_mkdir mkdir;
7a8721f8 962 struct io_symlink symlink;
cf30da90 963 struct io_hardlink hardlink;
4f57f06c 964 struct io_msg msg;
e9621e2b 965 struct io_xattr xattr;
1374e08e 966 struct io_socket sock;
2bb04df7 967 struct io_nop nop;
ee692a21 968 struct io_uring_cmd uring_cmd;
221c5eb2 969 };
2b188cc1 970
d625c6ee 971 u8 opcode;
65a6543d
XW
972 /* polled IO has completed */
973 u8 iopoll_completed;
1dbd023e
JA
974 /*
975 * Can be either a fixed buffer index, or used with provided buffers.
976 * For the latter, before issue it points to the buffer group ID,
977 * and after selection it points to the buffer ID itself.
978 */
4f4eeba8 979 u16 buf_index;
d17e56eb
PB
980 unsigned int flags;
981
cef216fc 982 struct io_cqe cqe;
4f4eeba8 983
010e8e6b 984 struct io_ring_ctx *ctx;
010e8e6b 985 struct task_struct *task;
d7718a9d 986
c1bdf8ed 987 struct io_rsrc_node *rsrc_node;
a4f8d94c
JA
988
989 union {
990 /* store used ubuf, so we can prevent reloading */
991 struct io_mapped_ubuf *imu;
992
993 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
994 struct io_buffer *kbuf;
995 };
fcb323cc 996
2804ecd8
JA
997 union {
998 /* used by request caches, completion batching and iopoll */
999 struct io_wq_work_node comp_list;
1000 /* cache ->apoll->events */
1001 int apoll_events;
1002 };
d17e56eb 1003 atomic_t refs;
521d61fc 1004 atomic_t poll_refs;
5b0a6acc 1005 struct io_task_work io_task_work;
010e8e6b 1006 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
4e5bc0a9
SR
1007 union {
1008 struct hlist_node hash_node;
1009 struct {
1010 u64 extra1;
1011 u64 extra2;
1012 };
1013 };
7e3709d5 1014 /* internal polling, see IORING_FEAT_FAST_POLL */
010e8e6b 1015 struct async_poll *apoll;
d886e185
PB
1016 /* opcode allocated if it needs to store data for async defer */
1017 void *async_data;
41cdcc22 1018 /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
34d2bfe7 1019 struct io_kiocb *link;
41cdcc22 1020 /* custom credentials, valid IFF REQ_F_CREDS is set */
521d61fc
JA
1021 const struct cred *creds;
1022 struct io_wq_work work;
2b188cc1 1023};
05589553 1024
13bf43f5
PB
1025struct io_tctx_node {
1026 struct list_head ctx_node;
1027 struct task_struct *task;
13bf43f5
PB
1028 struct io_ring_ctx *ctx;
1029};
1030
27dc8338
PB
1031struct io_defer_entry {
1032 struct list_head list;
1033 struct io_kiocb *req;
9cf7c104 1034 u32 seq;
2b188cc1
JA
1035};
1036
b21432b4
JA
1037struct io_cancel_data {
1038 struct io_ring_ctx *ctx;
4bf94615
JA
1039 union {
1040 u64 data;
1041 struct file *file;
1042 };
8e29da69
JA
1043 u32 flags;
1044 int seq;
b21432b4
JA
1045};
1046
ee692a21
JA
1047/*
1048 * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
1049 * the following sqe if SQE128 is used.
1050 */
1051#define uring_cmd_pdu_size(is_sqe128) \
1052 ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
1053 offsetof(struct io_uring_sqe, cmd))
1054
d3656344 1055struct io_op_def {
d3656344
JA
1056 /* needs req->file assigned */
1057 unsigned needs_file : 1;
6d63416d
PB
1058 /* should block plug */
1059 unsigned plug : 1;
d3656344
JA
1060 /* hash wq insertion if file is a regular file */
1061 unsigned hash_reg_file : 1;
1062 /* unbound wq insertion if file is a non-regular file */
1063 unsigned unbound_nonreg_file : 1;
8a72758c
JA
1064 /* set if opcode supports polled "wait" */
1065 unsigned pollin : 1;
1066 unsigned pollout : 1;
52dd8640 1067 unsigned poll_exclusive : 1;
bcda7baa
JA
1068 /* op supports buffer selection */
1069 unsigned buffer_select : 1;
26f0505a
PB
1070 /* do prep async if is going to be punted */
1071 unsigned needs_async_setup : 1;
6d63416d
PB
1072 /* opcode is not supported by this kernel */
1073 unsigned not_supported : 1;
5bd2182d
PM
1074 /* skip auditing */
1075 unsigned audit_skip : 1;
73911426
JA
1076 /* supports ioprio */
1077 unsigned ioprio : 1;
1078 /* supports iopoll */
1079 unsigned iopoll : 1;
e8c2bc1f
JA
1080 /* size of async data needed, if any */
1081 unsigned short async_size;
d3656344
JA
1082};
1083
0918682b 1084static const struct io_op_def io_op_defs[] = {
73911426
JA
1085 [IORING_OP_NOP] = {
1086 .audit_skip = 1,
1087 .iopoll = 1,
1088 },
0463b6c5 1089 [IORING_OP_READV] = {
d3656344
JA
1090 .needs_file = 1,
1091 .unbound_nonreg_file = 1,
8a72758c 1092 .pollin = 1,
4d954c25 1093 .buffer_select = 1,
26f0505a 1094 .needs_async_setup = 1,
27926b68 1095 .plug = 1,
5bd2182d 1096 .audit_skip = 1,
73911426
JA
1097 .ioprio = 1,
1098 .iopoll = 1,
e8c2bc1f 1099 .async_size = sizeof(struct io_async_rw),
d3656344 1100 },
0463b6c5 1101 [IORING_OP_WRITEV] = {
d3656344
JA
1102 .needs_file = 1,
1103 .hash_reg_file = 1,
1104 .unbound_nonreg_file = 1,
8a72758c 1105 .pollout = 1,
26f0505a 1106 .needs_async_setup = 1,
27926b68 1107 .plug = 1,
5bd2182d 1108 .audit_skip = 1,
73911426
JA
1109 .ioprio = 1,
1110 .iopoll = 1,
e8c2bc1f 1111 .async_size = sizeof(struct io_async_rw),
d3656344 1112 },
0463b6c5 1113 [IORING_OP_FSYNC] = {
d3656344 1114 .needs_file = 1,
5bd2182d 1115 .audit_skip = 1,
d3656344 1116 },
0463b6c5 1117 [IORING_OP_READ_FIXED] = {
d3656344
JA
1118 .needs_file = 1,
1119 .unbound_nonreg_file = 1,
8a72758c 1120 .pollin = 1,
27926b68 1121 .plug = 1,
5bd2182d 1122 .audit_skip = 1,
73911426
JA
1123 .ioprio = 1,
1124 .iopoll = 1,
e8c2bc1f 1125 .async_size = sizeof(struct io_async_rw),
d3656344 1126 },
0463b6c5 1127 [IORING_OP_WRITE_FIXED] = {
d3656344
JA
1128 .needs_file = 1,
1129 .hash_reg_file = 1,
1130 .unbound_nonreg_file = 1,
8a72758c 1131 .pollout = 1,
27926b68 1132 .plug = 1,
5bd2182d 1133 .audit_skip = 1,
73911426
JA
1134 .ioprio = 1,
1135 .iopoll = 1,
e8c2bc1f 1136 .async_size = sizeof(struct io_async_rw),
d3656344 1137 },
0463b6c5 1138 [IORING_OP_POLL_ADD] = {
d3656344
JA
1139 .needs_file = 1,
1140 .unbound_nonreg_file = 1,
5bd2182d
PM
1141 .audit_skip = 1,
1142 },
1143 [IORING_OP_POLL_REMOVE] = {
1144 .audit_skip = 1,
d3656344 1145 },
0463b6c5 1146 [IORING_OP_SYNC_FILE_RANGE] = {
d3656344 1147 .needs_file = 1,
5bd2182d 1148 .audit_skip = 1,
d3656344 1149 },
0463b6c5 1150 [IORING_OP_SENDMSG] = {
d3656344
JA
1151 .needs_file = 1,
1152 .unbound_nonreg_file = 1,
8a72758c 1153 .pollout = 1,
26f0505a 1154 .needs_async_setup = 1,
e8c2bc1f 1155 .async_size = sizeof(struct io_async_msghdr),
d3656344 1156 },
0463b6c5 1157 [IORING_OP_RECVMSG] = {
d3656344
JA
1158 .needs_file = 1,
1159 .unbound_nonreg_file = 1,
8a72758c 1160 .pollin = 1,
52de1fe1 1161 .buffer_select = 1,
26f0505a 1162 .needs_async_setup = 1,
e8c2bc1f 1163 .async_size = sizeof(struct io_async_msghdr),
d3656344 1164 },
0463b6c5 1165 [IORING_OP_TIMEOUT] = {
5bd2182d 1166 .audit_skip = 1,
e8c2bc1f 1167 .async_size = sizeof(struct io_timeout_data),
d3656344 1168 },
9c8e11b3
PB
1169 [IORING_OP_TIMEOUT_REMOVE] = {
1170 /* used by timeout updates' prep() */
5bd2182d 1171 .audit_skip = 1,
9c8e11b3 1172 },
0463b6c5 1173 [IORING_OP_ACCEPT] = {
d3656344
JA
1174 .needs_file = 1,
1175 .unbound_nonreg_file = 1,
8a72758c 1176 .pollin = 1,
52dd8640 1177 .poll_exclusive = 1,
d3656344 1178 },
5bd2182d
PM
1179 [IORING_OP_ASYNC_CANCEL] = {
1180 .audit_skip = 1,
1181 },
0463b6c5 1182 [IORING_OP_LINK_TIMEOUT] = {
5bd2182d 1183 .audit_skip = 1,
e8c2bc1f 1184 .async_size = sizeof(struct io_timeout_data),
d3656344 1185 },
0463b6c5 1186 [IORING_OP_CONNECT] = {
d3656344
JA
1187 .needs_file = 1,
1188 .unbound_nonreg_file = 1,
8a72758c 1189 .pollout = 1,
26f0505a 1190 .needs_async_setup = 1,
e8c2bc1f 1191 .async_size = sizeof(struct io_async_connect),
d3656344 1192 },
0463b6c5 1193 [IORING_OP_FALLOCATE] = {
d3656344 1194 .needs_file = 1,
d3656344 1195 },
44526bed
JA
1196 [IORING_OP_OPENAT] = {},
1197 [IORING_OP_CLOSE] = {},
5bd2182d
PM
1198 [IORING_OP_FILES_UPDATE] = {
1199 .audit_skip = 1,
73911426 1200 .iopoll = 1,
5bd2182d
PM
1201 },
1202 [IORING_OP_STATX] = {
1203 .audit_skip = 1,
1204 },
0463b6c5 1205 [IORING_OP_READ] = {
3a6820f2
JA
1206 .needs_file = 1,
1207 .unbound_nonreg_file = 1,
8a72758c 1208 .pollin = 1,
bcda7baa 1209 .buffer_select = 1,
27926b68 1210 .plug = 1,
5bd2182d 1211 .audit_skip = 1,
73911426
JA
1212 .ioprio = 1,
1213 .iopoll = 1,
e8c2bc1f 1214 .async_size = sizeof(struct io_async_rw),
3a6820f2 1215 },
0463b6c5 1216 [IORING_OP_WRITE] = {
3a6820f2 1217 .needs_file = 1,
7b3188e7 1218 .hash_reg_file = 1,
3a6820f2 1219 .unbound_nonreg_file = 1,
8a72758c 1220 .pollout = 1,
27926b68 1221 .plug = 1,
5bd2182d 1222 .audit_skip = 1,
73911426
JA
1223 .ioprio = 1,
1224 .iopoll = 1,
e8c2bc1f 1225 .async_size = sizeof(struct io_async_rw),
3a6820f2 1226 },
0463b6c5 1227 [IORING_OP_FADVISE] = {
4840e418 1228 .needs_file = 1,
5bd2182d 1229 .audit_skip = 1,
c1ca757b 1230 },
44526bed 1231 [IORING_OP_MADVISE] = {},
0463b6c5 1232 [IORING_OP_SEND] = {
fddaface
JA
1233 .needs_file = 1,
1234 .unbound_nonreg_file = 1,
8a72758c 1235 .pollout = 1,
5bd2182d 1236 .audit_skip = 1,
fddaface 1237 },
0463b6c5 1238 [IORING_OP_RECV] = {
fddaface
JA
1239 .needs_file = 1,
1240 .unbound_nonreg_file = 1,
8a72758c 1241 .pollin = 1,
bcda7baa 1242 .buffer_select = 1,
5bd2182d 1243 .audit_skip = 1,
fddaface 1244 },
0463b6c5 1245 [IORING_OP_OPENAT2] = {
cebdb986 1246 },
3e4827b0
JA
1247 [IORING_OP_EPOLL_CTL] = {
1248 .unbound_nonreg_file = 1,
5bd2182d 1249 .audit_skip = 1,
3e4827b0 1250 },
7d67af2c
PB
1251 [IORING_OP_SPLICE] = {
1252 .needs_file = 1,
1253 .hash_reg_file = 1,
1254 .unbound_nonreg_file = 1,
5bd2182d
PM
1255 .audit_skip = 1,
1256 },
1257 [IORING_OP_PROVIDE_BUFFERS] = {
1258 .audit_skip = 1,
73911426 1259 .iopoll = 1,
5bd2182d
PM
1260 },
1261 [IORING_OP_REMOVE_BUFFERS] = {
1262 .audit_skip = 1,
73911426 1263 .iopoll = 1,
ddf0322d 1264 },
f2a8d5c7
PB
1265 [IORING_OP_TEE] = {
1266 .needs_file = 1,
1267 .hash_reg_file = 1,
1268 .unbound_nonreg_file = 1,
5bd2182d 1269 .audit_skip = 1,
f2a8d5c7 1270 },
36f4fa68
JA
1271 [IORING_OP_SHUTDOWN] = {
1272 .needs_file = 1,
1273 },
44526bed
JA
1274 [IORING_OP_RENAMEAT] = {},
1275 [IORING_OP_UNLINKAT] = {},
e34a02dc 1276 [IORING_OP_MKDIRAT] = {},
7a8721f8 1277 [IORING_OP_SYMLINKAT] = {},
cf30da90 1278 [IORING_OP_LINKAT] = {},
4f57f06c
JA
1279 [IORING_OP_MSG_RING] = {
1280 .needs_file = 1,
73911426 1281 .iopoll = 1,
4f57f06c 1282 },
e9621e2b
SR
1283 [IORING_OP_FSETXATTR] = {
1284 .needs_file = 1
1285 },
1286 [IORING_OP_SETXATTR] = {},
a56834e0
SR
1287 [IORING_OP_FGETXATTR] = {
1288 .needs_file = 1
1289 },
1290 [IORING_OP_GETXATTR] = {},
1374e08e
JA
1291 [IORING_OP_SOCKET] = {
1292 .audit_skip = 1,
1293 },
ee692a21
JA
1294 [IORING_OP_URING_CMD] = {
1295 .needs_file = 1,
1296 .plug = 1,
1297 .needs_async_setup = 1,
1298 .async_size = uring_cmd_pdu_size(1),
1299 },
d3656344
JA
1300};
1301
0756a869
PB
1302/* requests with any of those set should undergo io_disarm_next() */
1303#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
da1a08c5 1304#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
0756a869 1305
7a612350 1306static bool io_disarm_next(struct io_kiocb *req);
eef51daa 1307static void io_uring_del_tctx_node(unsigned long index);
9936c7c2
PB
1308static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1309 struct task_struct *task,
3dd0c97a 1310 bool cancel_all);
78cc687b 1311static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1ffc5422 1312
4e118cd9 1313static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags);
c7dae4ba 1314static void io_dismantle_req(struct io_kiocb *req);
94ae5e77 1315static void io_queue_linked_timeout(struct io_kiocb *req);
fdecb662 1316static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 1317 struct io_uring_rsrc_update2 *up,
98f0b3b4 1318 unsigned nr_args);
68fb8979 1319static void io_clean_op(struct io_kiocb *req);
5106dd6e
JA
1320static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
1321 unsigned issue_flags);
1322static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
d5361233
JA
1323static void io_drop_inflight_file(struct io_kiocb *req);
1324static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
cbc2e203 1325static void io_queue_sqe(struct io_kiocb *req);
269bbe5f 1326static void io_rsrc_put_work(struct work_struct *work);
de0617e4 1327
907d1df3 1328static void io_req_task_queue(struct io_kiocb *req);
c450178d 1329static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
179ae0d1 1330static int io_req_prep_async(struct io_kiocb *req);
de0617e4 1331
b9445598
PB
1332static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1333 unsigned int issue_flags, u32 slot_index);
7df778be
PB
1334static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
1335
f1042b6c 1336static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
9aa8dfde 1337static void io_eventfd_signal(struct io_ring_ctx *ctx);
4e118cd9 1338static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
b9445598 1339
2b188cc1
JA
1340static struct kmem_cache *req_cachep;
1341
0918682b 1342static const struct file_operations io_uring_fops;
2b188cc1 1343
33337d03
DY
1344const char *io_uring_get_opcode(u8 opcode)
1345{
1346 switch ((enum io_uring_op)opcode) {
1347 case IORING_OP_NOP:
1348 return "NOP";
1349 case IORING_OP_READV:
1350 return "READV";
1351 case IORING_OP_WRITEV:
1352 return "WRITEV";
1353 case IORING_OP_FSYNC:
1354 return "FSYNC";
1355 case IORING_OP_READ_FIXED:
1356 return "READ_FIXED";
1357 case IORING_OP_WRITE_FIXED:
1358 return "WRITE_FIXED";
1359 case IORING_OP_POLL_ADD:
1360 return "POLL_ADD";
1361 case IORING_OP_POLL_REMOVE:
1362 return "POLL_REMOVE";
1363 case IORING_OP_SYNC_FILE_RANGE:
1364 return "SYNC_FILE_RANGE";
1365 case IORING_OP_SENDMSG:
1366 return "SENDMSG";
1367 case IORING_OP_RECVMSG:
1368 return "RECVMSG";
1369 case IORING_OP_TIMEOUT:
1370 return "TIMEOUT";
1371 case IORING_OP_TIMEOUT_REMOVE:
1372 return "TIMEOUT_REMOVE";
1373 case IORING_OP_ACCEPT:
1374 return "ACCEPT";
1375 case IORING_OP_ASYNC_CANCEL:
1376 return "ASYNC_CANCEL";
1377 case IORING_OP_LINK_TIMEOUT:
1378 return "LINK_TIMEOUT";
1379 case IORING_OP_CONNECT:
1380 return "CONNECT";
1381 case IORING_OP_FALLOCATE:
1382 return "FALLOCATE";
1383 case IORING_OP_OPENAT:
1384 return "OPENAT";
1385 case IORING_OP_CLOSE:
1386 return "CLOSE";
1387 case IORING_OP_FILES_UPDATE:
1388 return "FILES_UPDATE";
1389 case IORING_OP_STATX:
1390 return "STATX";
1391 case IORING_OP_READ:
1392 return "READ";
1393 case IORING_OP_WRITE:
1394 return "WRITE";
1395 case IORING_OP_FADVISE:
1396 return "FADVISE";
1397 case IORING_OP_MADVISE:
1398 return "MADVISE";
1399 case IORING_OP_SEND:
1400 return "SEND";
1401 case IORING_OP_RECV:
1402 return "RECV";
1403 case IORING_OP_OPENAT2:
1404 return "OPENAT2";
1405 case IORING_OP_EPOLL_CTL:
1406 return "EPOLL_CTL";
1407 case IORING_OP_SPLICE:
1408 return "SPLICE";
1409 case IORING_OP_PROVIDE_BUFFERS:
1410 return "PROVIDE_BUFFERS";
1411 case IORING_OP_REMOVE_BUFFERS:
1412 return "REMOVE_BUFFERS";
1413 case IORING_OP_TEE:
1414 return "TEE";
1415 case IORING_OP_SHUTDOWN:
1416 return "SHUTDOWN";
1417 case IORING_OP_RENAMEAT:
1418 return "RENAMEAT";
1419 case IORING_OP_UNLINKAT:
1420 return "UNLINKAT";
1421 case IORING_OP_MKDIRAT:
1422 return "MKDIRAT";
1423 case IORING_OP_SYMLINKAT:
1424 return "SYMLINKAT";
1425 case IORING_OP_LINKAT:
1426 return "LINKAT";
1427 case IORING_OP_MSG_RING:
1428 return "MSG_RING";
1429 case IORING_OP_FSETXATTR:
1430 return "FSETXATTR";
1431 case IORING_OP_SETXATTR:
1432 return "SETXATTR";
1433 case IORING_OP_FGETXATTR:
1434 return "FGETXATTR";
1435 case IORING_OP_GETXATTR:
1436 return "GETXATTR";
1437 case IORING_OP_SOCKET:
1438 return "SOCKET";
ee692a21
JA
1439 case IORING_OP_URING_CMD:
1440 return "URING_CMD";
33337d03
DY
1441 case IORING_OP_LAST:
1442 return "INVALID";
1443 }
1444 return "INVALID";
1445}
1446
2b188cc1
JA
1447struct sock *io_uring_get_socket(struct file *file)
1448{
1449#if defined(CONFIG_UNIX)
1450 if (file->f_op == &io_uring_fops) {
1451 struct io_ring_ctx *ctx = file->private_data;
1452
1453 return ctx->ring_sock->sk;
1454 }
1455#endif
1456 return NULL;
1457}
1458EXPORT_SYMBOL(io_uring_get_socket);
1459
1f59bc0f
PB
1460#if defined(CONFIG_UNIX)
1461static inline bool io_file_need_scm(struct file *filp)
1462{
5e45690a
JA
1463#if defined(IO_URING_SCM_ALL)
1464 return true;
1465#else
1f59bc0f 1466 return !!unix_get_socket(filp);
5e45690a 1467#endif
1f59bc0f
PB
1468}
1469#else
1470static inline bool io_file_need_scm(struct file *filp)
1471{
5e45690a 1472 return false;
1f59bc0f
PB
1473}
1474#endif
1475
f8929630
PB
1476static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags)
1477{
1478 lockdep_assert_held(&ctx->uring_lock);
1479 if (issue_flags & IO_URING_F_UNLOCKED)
1480 mutex_unlock(&ctx->uring_lock);
1481}
1482
1483static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags)
1484{
1485 /*
1486 * "Normal" inline submissions always hold the uring_lock, since we
1487 * grab it from the system call. Same is true for the SQPOLL offload.
1488 * The only exception is when we've detached the request and issue it
1489 * from an async worker thread, grab the lock for that case.
1490 */
1491 if (issue_flags & IO_URING_F_UNLOCKED)
1492 mutex_lock(&ctx->uring_lock);
1493 lockdep_assert_held(&ctx->uring_lock);
1494}
1495
f237c30a
PB
1496static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1497{
1498 if (!*locked) {
1499 mutex_lock(&ctx->uring_lock);
1500 *locked = true;
1501 }
1502}
1503
f2f87370
PB
1504#define io_for_each_link(pos, head) \
1505 for (pos = (head); pos; pos = pos->link)
1506
21c843d5
PB
1507/*
1508 * Shamelessly stolen from the mm implementation of page reference checking,
1509 * see commit f958d7b528b1 for details.
1510 */
1511#define req_ref_zero_or_close_to_overflow(req) \
1512 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
1513
1514static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1515{
20e60a38 1516 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1517 return atomic_inc_not_zero(&req->refs);
1518}
1519
21c843d5
PB
1520static inline bool req_ref_put_and_test(struct io_kiocb *req)
1521{
20e60a38
PB
1522 if (likely(!(req->flags & REQ_F_REFCOUNT)))
1523 return true;
1524
21c843d5
PB
1525 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1526 return atomic_dec_and_test(&req->refs);
1527}
1528
21c843d5
PB
1529static inline void req_ref_get(struct io_kiocb *req)
1530{
20e60a38 1531 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
21c843d5
PB
1532 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1533 atomic_inc(&req->refs);
1534}
1535
c450178d
PB
1536static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1537{
6f33b0bc 1538 if (!wq_list_empty(&ctx->submit_state.compl_reqs))
c450178d
PB
1539 __io_submit_flush_completions(ctx);
1540}
1541
48dcd38d 1542static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
20e60a38
PB
1543{
1544 if (!(req->flags & REQ_F_REFCOUNT)) {
1545 req->flags |= REQ_F_REFCOUNT;
48dcd38d 1546 atomic_set(&req->refs, nr);
20e60a38
PB
1547 }
1548}
1549
48dcd38d
PB
1550static inline void io_req_set_refcount(struct io_kiocb *req)
1551{
1552 __io_req_set_refcount(req, 1);
1553}
1554
ab409402
PB
1555#define IO_RSRC_REF_BATCH 100
1556
25a15d3c
PB
1557static void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
1558{
1559 percpu_ref_put_many(&node->refs, nr);
1560}
1561
ab409402
PB
1562static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1563 struct io_ring_ctx *ctx)
1564 __must_hold(&ctx->uring_lock)
36f72fe2 1565{
c1bdf8ed 1566 struct io_rsrc_node *node = req->rsrc_node;
ab409402 1567
c1bdf8ed
PB
1568 if (node) {
1569 if (node == ctx->rsrc_node)
ab409402
PB
1570 ctx->rsrc_cached_refs++;
1571 else
25a15d3c 1572 io_rsrc_put_node(node, 1);
ab409402
PB
1573 }
1574}
1575
7ac1edc4 1576static inline void io_req_put_rsrc(struct io_kiocb *req)
ab409402 1577{
c1bdf8ed 1578 if (req->rsrc_node)
25a15d3c 1579 io_rsrc_put_node(req->rsrc_node, 1);
ab409402
PB
1580}
1581
1582static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1583 __must_hold(&ctx->uring_lock)
1584{
1585 if (ctx->rsrc_cached_refs) {
25a15d3c 1586 io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
ab409402
PB
1587 ctx->rsrc_cached_refs = 0;
1588 }
1589}
1590
1591static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1592 __must_hold(&ctx->uring_lock)
1593{
1594 ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1595 percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1596}
36f72fe2 1597
a46be971 1598static inline void io_req_set_rsrc_node(struct io_kiocb *req,
5106dd6e
JA
1599 struct io_ring_ctx *ctx,
1600 unsigned int issue_flags)
36f72fe2 1601{
c1bdf8ed
PB
1602 if (!req->rsrc_node) {
1603 req->rsrc_node = ctx->rsrc_node;
5106dd6e
JA
1604
1605 if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1606 lockdep_assert_held(&ctx->uring_lock);
1607 ctx->rsrc_cached_refs--;
1608 if (unlikely(ctx->rsrc_cached_refs < 0))
1609 io_rsrc_refs_refill(ctx);
1610 } else {
c1bdf8ed 1611 percpu_ref_get(&req->rsrc_node->refs);
5106dd6e 1612 }
36f72fe2
PB
1613 }
1614}
1615
cc3cec83 1616static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
3648e526 1617{
3648e526 1618 req->flags &= ~REQ_F_BUFFER_SELECTED;
1dbd023e 1619 list_add(&req->kbuf->list, list);
1dbd023e
JA
1620
1621 return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
3648e526
HX
1622}
1623
cc3cec83 1624static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
3648e526 1625{
8197b053
PB
1626 lockdep_assert_held(&req->ctx->completion_lock);
1627
3648e526
HX
1628 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1629 return 0;
cc3cec83
JA
1630 return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
1631}
1632
1633static inline unsigned int io_put_kbuf(struct io_kiocb *req,
1634 unsigned issue_flags)
1635{
1636 unsigned int cflags;
1637
1638 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1639 return 0;
1640
1641 /*
1642 * We can add this buffer back to two lists:
1643 *
1644 * 1) The io_buffers_cache list. This one is protected by the
1645 * ctx->uring_lock. If we already hold this lock, add back to this
1646 * list as we can grab it from issue as well.
1647 * 2) The io_buffers_comp list. This one is protected by the
1648 * ctx->completion_lock.
1649 *
1650 * We migrate buffers from the comp_list to the issue cache list
1651 * when we need one.
1652 */
1653 if (issue_flags & IO_URING_F_UNLOCKED) {
1654 struct io_ring_ctx *ctx = req->ctx;
1655
1656 spin_lock(&ctx->completion_lock);
1657 cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
1658 spin_unlock(&ctx->completion_lock);
1659 } else {
ab0ac095
PB
1660 lockdep_assert_held(&req->ctx->uring_lock);
1661
cc3cec83
JA
1662 cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
1663 }
1664
1665 return cflags;
3648e526
HX
1666}
1667
dbc7d452
JA
1668static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
1669 unsigned int bgid)
1670{
9cfc7e94
JA
1671 if (ctx->io_bl && bgid < BGID_ARRAY)
1672 return &ctx->io_bl[bgid];
dbc7d452 1673
9cfc7e94 1674 return xa_load(&ctx->io_bl_xa, bgid);
dbc7d452
JA
1675}
1676
4d55f238 1677static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
b1c62645
JA
1678{
1679 struct io_ring_ctx *ctx = req->ctx;
dbc7d452
JA
1680 struct io_buffer_list *bl;
1681 struct io_buffer *buf;
b1c62645
JA
1682
1683 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1684 return;
8a3e8ee5
JA
1685 /* don't recycle if we already did IO to this buffer */
1686 if (req->flags & REQ_F_PARTIAL_IO)
1687 return;
b1c62645 1688
f8929630 1689 io_ring_submit_lock(ctx, issue_flags);
b1c62645
JA
1690
1691 buf = req->kbuf;
dbc7d452
JA
1692 bl = io_buffer_get_list(ctx, buf->bgid);
1693 list_add(&buf->list, &bl->buf_list);
b1c62645 1694 req->flags &= ~REQ_F_BUFFER_SELECTED;
1dbd023e 1695 req->buf_index = buf->bgid;
4d55f238 1696
f8929630 1697 io_ring_submit_unlock(ctx, issue_flags);
b1c62645
JA
1698}
1699
3dd0c97a
PB
1700static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1701 bool cancel_all)
6af3f48b 1702 __must_hold(&req->ctx->timeout_lock)
08d23634 1703{
68207680 1704 if (task && head->task != task)
08d23634 1705 return false;
d5361233 1706 return cancel_all;
6af3f48b
PB
1707}
1708
1709/*
1710 * As io_match_task() but protected against racing with linked timeouts.
1711 * User must not hold timeout_lock.
1712 */
1713static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
1714 bool cancel_all)
1715{
6af3f48b
PB
1716 if (task && head->task != task)
1717 return false;
d5361233 1718 return cancel_all;
6af3f48b
PB
1719}
1720
d886e185
PB
1721static inline bool req_has_async_data(struct io_kiocb *req)
1722{
1723 return req->flags & REQ_F_ASYNC_DATA;
1724}
1725
93d2bcd2 1726static inline void req_set_fail(struct io_kiocb *req)
c40f6379 1727{
93d2bcd2 1728 req->flags |= REQ_F_FAIL;
04c76b41
PB
1729 if (req->flags & REQ_F_CQE_SKIP) {
1730 req->flags &= ~REQ_F_CQE_SKIP;
1731 req->flags |= REQ_F_SKIP_LINK_CQES;
1732 }
c40f6379 1733}
4a38aed2 1734
a8295b98
HX
1735static inline void req_fail_link_node(struct io_kiocb *req, int res)
1736{
1737 req_set_fail(req);
cef216fc 1738 req->cqe.res = res;
a8295b98
HX
1739}
1740
fa05457a
PB
1741static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
1742{
1743 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
a8295b98
HX
1744}
1745
c072481d 1746static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
2b188cc1
JA
1747{
1748 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1749
0f158b4c 1750 complete(&ctx->ref_comp);
2b188cc1
JA
1751}
1752
8eb7e2d0
PB
1753static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1754{
1755 return !req->timeout.off;
1756}
1757
c072481d 1758static __cold void io_fallback_req_func(struct work_struct *work)
f56165e6
PB
1759{
1760 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1761 fallback_work.work);
1762 struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1763 struct io_kiocb *req, *tmp;
f237c30a 1764 bool locked = false;
f56165e6
PB
1765
1766 percpu_ref_get(&ctx->refs);
1767 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
f237c30a 1768 req->io_task_work.func(req, &locked);
5636c00d 1769
f237c30a 1770 if (locked) {
c450178d 1771 io_submit_flush_completions(ctx);
f237c30a
PB
1772 mutex_unlock(&ctx->uring_lock);
1773 }
f56165e6
PB
1774 percpu_ref_put(&ctx->refs);
1775}
1776
c072481d 1777static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
2b188cc1
JA
1778{
1779 struct io_ring_ctx *ctx;
9cfc7e94 1780 int hash_bits;
2b188cc1
JA
1781
1782 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1783 if (!ctx)
1784 return NULL;
1785
9cfc7e94
JA
1786 xa_init(&ctx->io_bl_xa);
1787
78076bb6
JA
1788 /*
1789 * Use 5 bits less than the max cq entries, that should give us around
1790 * 32 entries per hash list if totally full and uniformly spread.
1791 */
1792 hash_bits = ilog2(p->cq_entries);
1793 hash_bits -= 5;
1794 if (hash_bits <= 0)
1795 hash_bits = 1;
1796 ctx->cancel_hash_bits = hash_bits;
1797 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1798 GFP_KERNEL);
1799 if (!ctx->cancel_hash)
1800 goto err;
1801 __hash_init(ctx->cancel_hash, 1U << hash_bits);
1802
6224843d
PB
1803 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1804 if (!ctx->dummy_ubuf)
1805 goto err;
1806 /* set invalid range, so io_import_fixed() fails meeting it */
1807 ctx->dummy_ubuf->ubuf = -1UL;
1808
21482896 1809 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
206aefde
JA
1810 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1811 goto err;
2b188cc1
JA
1812
1813 ctx->flags = p->flags;
90554200 1814 init_waitqueue_head(&ctx->sqo_sq_wait);
69fb2131 1815 INIT_LIST_HEAD(&ctx->sqd_list);
1d7bb1d5 1816 INIT_LIST_HEAD(&ctx->cq_overflow_list);
cc3cec83 1817 INIT_LIST_HEAD(&ctx->io_buffers_cache);
4d9237e3 1818 INIT_LIST_HEAD(&ctx->apoll_cache);
0f158b4c 1819 init_completion(&ctx->ref_comp);
61cf9370 1820 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
2b188cc1 1821 mutex_init(&ctx->uring_lock);
311997b3 1822 init_waitqueue_head(&ctx->cq_wait);
2b188cc1 1823 spin_lock_init(&ctx->completion_lock);
89850fce 1824 spin_lock_init(&ctx->timeout_lock);
5eef4e87 1825 INIT_WQ_LIST(&ctx->iopoll_list);
cc3cec83
JA
1826 INIT_LIST_HEAD(&ctx->io_buffers_pages);
1827 INIT_LIST_HEAD(&ctx->io_buffers_comp);
de0617e4 1828 INIT_LIST_HEAD(&ctx->defer_list);
5262f567 1829 INIT_LIST_HEAD(&ctx->timeout_list);
ef9dd637 1830 INIT_LIST_HEAD(&ctx->ltimeout_list);
d67d2263
BM
1831 spin_lock_init(&ctx->rsrc_ref_lock);
1832 INIT_LIST_HEAD(&ctx->rsrc_ref_list);
269bbe5f
BM
1833 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1834 init_llist_head(&ctx->rsrc_put_llist);
13bf43f5 1835 INIT_LIST_HEAD(&ctx->tctx_list);
c2b6c6bc
PB
1836 ctx->submit_state.free_list.next = NULL;
1837 INIT_WQ_LIST(&ctx->locked_free_list);
9011bf9a 1838 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
6f33b0bc 1839 INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
2b188cc1 1840 return ctx;
206aefde 1841err:
6224843d 1842 kfree(ctx->dummy_ubuf);
78076bb6 1843 kfree(ctx->cancel_hash);
9cfc7e94
JA
1844 kfree(ctx->io_bl);
1845 xa_destroy(&ctx->io_bl_xa);
206aefde
JA
1846 kfree(ctx);
1847 return NULL;
2b188cc1
JA
1848}
1849
8f6ed49a
PB
1850static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1851{
1852 struct io_rings *r = ctx->rings;
1853
1854 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1855 ctx->cq_extra--;
1856}
1857
9cf7c104 1858static bool req_need_defer(struct io_kiocb *req, u32 seq)
7adf4eaf 1859{
2bc9930e
JA
1860 if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1861 struct io_ring_ctx *ctx = req->ctx;
a197f664 1862
8f6ed49a 1863 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
2bc9930e 1864 }
de0617e4 1865
9d858b21 1866 return false;
de0617e4
JA
1867}
1868
c97d8a0f
PB
1869static inline bool io_req_ffs_set(struct io_kiocb *req)
1870{
35645ac3 1871 return req->flags & REQ_F_FIXED_FILE;
c97d8a0f
PB
1872}
1873
fd08e530
PB
1874static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1875{
906c6caa
PB
1876 if (WARN_ON_ONCE(!req->link))
1877 return NULL;
1878
4d13d1a4
PB
1879 req->flags &= ~REQ_F_ARM_LTIMEOUT;
1880 req->flags |= REQ_F_LINK_TIMEOUT;
fd08e530
PB
1881
1882 /* linked timeouts should have two refs once prep'ed */
48dcd38d 1883 io_req_set_refcount(req);
4d13d1a4
PB
1884 __io_req_set_refcount(req->link, 2);
1885 return req->link;
fd08e530
PB
1886}
1887
1888static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1889{
4d13d1a4 1890 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
fd08e530
PB
1891 return NULL;
1892 return __io_prep_linked_timeout(req);
1893}
1894
cb2d344c
PB
1895static noinline void __io_arm_ltimeout(struct io_kiocb *req)
1896{
1897 io_queue_linked_timeout(__io_prep_linked_timeout(req));
1898}
1899
1900static inline void io_arm_ltimeout(struct io_kiocb *req)
1901{
1902 if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
1903 __io_arm_ltimeout(req);
1904}
1905
1e6fa521
JA
1906static void io_prep_async_work(struct io_kiocb *req)
1907{
1908 const struct io_op_def *def = &io_op_defs[req->opcode];
1e6fa521
JA
1909 struct io_ring_ctx *ctx = req->ctx;
1910
b8e64b53
PB
1911 if (!(req->flags & REQ_F_CREDS)) {
1912 req->flags |= REQ_F_CREDS;
c10d1f98 1913 req->creds = get_current_cred();
b8e64b53 1914 }
003e8dcc 1915
e1d675df
PB
1916 req->work.list.next = NULL;
1917 req->work.flags = 0;
8e29da69 1918 req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
feaadc4f
PB
1919 if (req->flags & REQ_F_FORCE_ASYNC)
1920 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1921
1e6fa521
JA
1922 if (req->flags & REQ_F_ISREG) {
1923 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1924 io_wq_hash_work(&req->work, file_inode(req->file));
4b982bd0 1925 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1e6fa521
JA
1926 if (def->unbound_nonreg_file)
1927 req->work.flags |= IO_WQ_WORK_UNBOUND;
1928 }
561fb04a 1929}
cccf0ee8 1930
cbdcb435 1931static void io_prep_async_link(struct io_kiocb *req)
561fb04a 1932{
cbdcb435 1933 struct io_kiocb *cur;
54a91f3b 1934
44eff40a
PB
1935 if (req->flags & REQ_F_LINK_TIMEOUT) {
1936 struct io_ring_ctx *ctx = req->ctx;
1937
674ee8e1 1938 spin_lock_irq(&ctx->timeout_lock);
44eff40a
PB
1939 io_for_each_link(cur, req)
1940 io_prep_async_work(cur);
674ee8e1 1941 spin_unlock_irq(&ctx->timeout_lock);
44eff40a
PB
1942 } else {
1943 io_for_each_link(cur, req)
1944 io_prep_async_work(cur);
1945 }
561fb04a
JA
1946}
1947
fff4e40e
PB
1948static inline void io_req_add_compl_list(struct io_kiocb *req)
1949{
775a1f2f 1950 struct io_submit_state *state = &req->ctx->submit_state;
fff4e40e 1951
3d4aeb9f 1952 if (!(req->flags & REQ_F_CQE_SKIP))
775a1f2f 1953 state->flush_cqes = true;
fff4e40e
PB
1954 wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1955}
1956
77955efb 1957static void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
561fb04a 1958{
cbdcb435 1959 struct io_kiocb *link = io_prep_linked_timeout(req);
5aa75ed5 1960 struct io_uring_task *tctx = req->task->io_uring;
561fb04a 1961
3bfe6106
JA
1962 BUG_ON(!tctx);
1963 BUG_ON(!tctx->io_wq);
561fb04a 1964
cbdcb435
PB
1965 /* init ->work of the whole link before punting */
1966 io_prep_async_link(req);
991468dc
JA
1967
1968 /*
1969 * Not expected to happen, but if we do have a bug where this _can_
1970 * happen, catch it here and ensure the request is marked as
1971 * canceled. That will make io-wq go through the usual work cancel
1972 * procedure rather than attempt to run this request (or create a new
1973 * worker for it).
1974 */
1975 if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1976 req->work.flags |= IO_WQ_WORK_CANCEL;
1977
971cf9c1
PB
1978 trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
1979 req->opcode, req->flags, &req->work,
1980 io_wq_is_hashed(&req->work));
ebf93667 1981 io_wq_enqueue(tctx->io_wq, &req->work);
7271ef3a
JA
1982 if (link)
1983 io_queue_linked_timeout(link);
cbdcb435
PB
1984}
1985
1ee4160c 1986static void io_kill_timeout(struct io_kiocb *req, int status)
8c855885 1987 __must_hold(&req->ctx->completion_lock)
89850fce 1988 __must_hold(&req->ctx->timeout_lock)
5262f567 1989{
e8c2bc1f 1990 struct io_timeout_data *io = req->async_data;
5262f567 1991
fd9c7bc5 1992 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2ae2eb9d
PB
1993 if (status)
1994 req_set_fail(req);
01cec8c1
PB
1995 atomic_set(&req->ctx->cq_timeouts,
1996 atomic_read(&req->ctx->cq_timeouts) + 1);
135fcde8 1997 list_del_init(&req->timeout.list);
4e118cd9 1998 io_req_tw_post_queue(req, status, 0);
5262f567
JA
1999 }
2000}
2001
c072481d 2002static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
de0617e4 2003{
441b8a78 2004 while (!list_empty(&ctx->defer_list)) {
27dc8338
PB
2005 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
2006 struct io_defer_entry, list);
de0617e4 2007
9cf7c104 2008 if (req_need_defer(de->req, de->seq))
04518945 2009 break;
27dc8338 2010 list_del_init(&de->list);
907d1df3 2011 io_req_task_queue(de->req);
27dc8338 2012 kfree(de);
441b8a78 2013 }
04518945
PB
2014}
2015
c072481d 2016static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
89850fce 2017 __must_hold(&ctx->completion_lock)
de0617e4 2018{
441b8a78 2019 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
e677edbc 2020 struct io_kiocb *req, *tmp;
f010505b 2021
79ebeaee 2022 spin_lock_irq(&ctx->timeout_lock);
e677edbc 2023 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
f010505b 2024 u32 events_needed, events_got;
de0617e4 2025
8eb7e2d0 2026 if (io_is_timeout_noseq(req))
360428f8 2027 break;
f010505b
MDG
2028
2029 /*
2030 * Since seq can easily wrap around over time, subtract
2031 * the last seq at which timeouts were flushed before comparing.
2032 * Assuming not more than 2^31-1 events have happened since,
2033 * these subtractions won't have wrapped, so we can check if
2034 * target is in [last_seq, current_seq] by comparing the two.
2035 */
2036 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
2037 events_got = seq - ctx->cq_last_tm_flush;
2038 if (events_got < events_needed)
360428f8 2039 break;
bfe68a22 2040
1ee4160c 2041 io_kill_timeout(req, 0);
f18ee4cf 2042 }
f010505b 2043 ctx->cq_last_tm_flush = seq;
79ebeaee 2044 spin_unlock_irq(&ctx->timeout_lock);
360428f8 2045}
5262f567 2046
9333f6b4
PB
2047static inline void io_commit_cqring(struct io_ring_ctx *ctx)
2048{
2049 /* order cqe stores with ring update */
2050 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
2051}
2052
9aa8dfde 2053static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
360428f8 2054{
9aa8dfde
PB
2055 if (ctx->off_timeout_used || ctx->drain_active) {
2056 spin_lock(&ctx->completion_lock);
2057 if (ctx->off_timeout_used)
2058 io_flush_timeouts(ctx);
2059 if (ctx->drain_active)
2060 io_queue_deferred(ctx);
2061 io_commit_cqring(ctx);
2062 spin_unlock(&ctx->completion_lock);
2063 }
2064 if (ctx->has_evfd)
2065 io_eventfd_signal(ctx);
de0617e4
JA
2066}
2067
90554200
JA
2068static inline bool io_sqring_full(struct io_ring_ctx *ctx)
2069{
2070 struct io_rings *r = ctx->rings;
2071
a566c556 2072 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
90554200
JA
2073}
2074
888aae2e
PB
2075static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
2076{
2077 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
2078}
2079
d8da428b
PB
2080/*
2081 * writes to the cq entry need to come after reading head; the
2082 * control dependency is enough as we're using WRITE_ONCE to
2083 * fill the cq entry
2084 */
2085static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
2b188cc1 2086{
75b28aff 2087 struct io_rings *rings = ctx->rings;
d8da428b 2088 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
2fee6bc6 2089 unsigned int shift = 0;
d8da428b
PB
2090 unsigned int free, queued, len;
2091
2fee6bc6
SR
2092 if (ctx->flags & IORING_SETUP_CQE32)
2093 shift = 1;
2094
d8da428b
PB
2095 /* userspace may cheat modifying the tail, be safe and do min */
2096 queued = min(__io_cqring_events(ctx), ctx->cq_entries);
2097 free = ctx->cq_entries - queued;
2098 /* we need a contiguous range, limit based on the current array offset */
2099 len = min(free, ctx->cq_entries - off);
2100 if (!len)
2b188cc1
JA
2101 return NULL;
2102
d8da428b
PB
2103 ctx->cached_cq_tail++;
2104 ctx->cqe_cached = &rings->cqes[off];
2105 ctx->cqe_sentinel = ctx->cqe_cached + len;
2fee6bc6
SR
2106 ctx->cqe_cached++;
2107 return &rings->cqes[off << shift];
d8da428b
PB
2108}
2109
2110static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
2111{
2112 if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
2fee6bc6
SR
2113 struct io_uring_cqe *cqe = ctx->cqe_cached;
2114
2115 if (ctx->flags & IORING_SETUP_CQE32) {
2116 unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
2117
2118 cqe += off;
2119 }
2120
d8da428b 2121 ctx->cached_cq_tail++;
2fee6bc6
SR
2122 ctx->cqe_cached++;
2123 return cqe;
d8da428b 2124 }
2fee6bc6 2125
d8da428b 2126 return __io_get_cqe(ctx);
2b188cc1
JA
2127}
2128
77bc59b4 2129static void io_eventfd_signal(struct io_ring_ctx *ctx)
f2842ab5 2130{
77bc59b4
UA
2131 struct io_ev_fd *ev_fd;
2132
77bc59b4
UA
2133 rcu_read_lock();
2134 /*
2135 * rcu_dereference ctx->io_ev_fd once and use it for both for checking
2136 * and eventfd_signal
2137 */
2138 ev_fd = rcu_dereference(ctx->io_ev_fd);
2139
2140 /*
2141 * Check again if ev_fd exists incase an io_eventfd_unregister call
2142 * completed between the NULL check of ctx->io_ev_fd at the start of
2143 * the function and rcu_read_lock.
2144 */
2145 if (unlikely(!ev_fd))
2146 goto out;
7e55a19c 2147 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
77bc59b4
UA
2148 goto out;
2149
c75312dd 2150 if (!ev_fd->eventfd_async || io_wq_current_is_worker())
77bc59b4 2151 eventfd_signal(ev_fd->cq_ev_fd, 1);
77bc59b4
UA
2152out:
2153 rcu_read_unlock();
f2842ab5
JA
2154}
2155
9aa8dfde
PB
2156static inline void io_cqring_wake(struct io_ring_ctx *ctx)
2157{
2158 /*
2159 * wake_up_all() may seem excessive, but io_wake_function() and
2160 * io_should_wake() handle the termination of the loop and only
2161 * wake as many waiters as we need to.
2162 */
2163 if (wq_has_sleeper(&ctx->cq_wait))
2164 wake_up_all(&ctx->cq_wait);
2165}
2166
2c5d763c
JA
2167/*
2168 * This should only get called when at least one event has been posted.
2169 * Some applications rely on the eventfd notification count only changing
2170 * IFF a new CQE has been added to the CQ ring. There's no depedency on
2171 * 1:1 relationship between how many times this function is called (and
2172 * hence the eventfd count) and number of CQEs posted to the CQ ring.
2173 */
66fc25ca 2174static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1d7bb1d5 2175{
9aa8dfde
PB
2176 if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
2177 ctx->has_evfd))
9333f6b4
PB
2178 __io_commit_cqring_flush(ctx);
2179
9aa8dfde 2180 io_cqring_wake(ctx);
1d7bb1d5
JA
2181}
2182
80c18e4a
PB
2183static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
2184{
9aa8dfde
PB
2185 if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
2186 ctx->has_evfd))
9333f6b4
PB
2187 __io_commit_cqring_flush(ctx);
2188
9aa8dfde
PB
2189 if (ctx->flags & IORING_SETUP_SQPOLL)
2190 io_cqring_wake(ctx);
80c18e4a
PB
2191}
2192
c4a2ed72 2193/* Returns true if there are no backlogged entries after the flush */
6c2450ae 2194static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1d7bb1d5 2195{
b18032bb 2196 bool all_flushed, posted;
e45a3e05 2197 size_t cqe_size = sizeof(struct io_uring_cqe);
1d7bb1d5 2198
a566c556 2199 if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
e23de15f 2200 return false;
1d7bb1d5 2201
e45a3e05
SR
2202 if (ctx->flags & IORING_SETUP_CQE32)
2203 cqe_size <<= 1;
2204
b18032bb 2205 posted = false;
79ebeaee 2206 spin_lock(&ctx->completion_lock);
6c2450ae 2207 while (!list_empty(&ctx->cq_overflow_list)) {
d068b506 2208 struct io_uring_cqe *cqe = io_get_cqe(ctx);
6c2450ae 2209 struct io_overflow_cqe *ocqe;
e6c8aa9a 2210
1d7bb1d5
JA
2211 if (!cqe && !force)
2212 break;
6c2450ae
PB
2213 ocqe = list_first_entry(&ctx->cq_overflow_list,
2214 struct io_overflow_cqe, list);
2215 if (cqe)
e45a3e05 2216 memcpy(cqe, &ocqe->cqe, cqe_size);
6c2450ae 2217 else
8f6ed49a
PB
2218 io_account_cq_overflow(ctx);
2219
b18032bb 2220 posted = true;
6c2450ae
PB
2221 list_del(&ocqe->list);
2222 kfree(ocqe);
1d7bb1d5
JA
2223 }
2224
09e88404
PB
2225 all_flushed = list_empty(&ctx->cq_overflow_list);
2226 if (all_flushed) {
10988a0a 2227 clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
3a4b89a2 2228 atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
09e88404 2229 }
46930143 2230
60053be8 2231 io_commit_cqring(ctx);
79ebeaee 2232 spin_unlock(&ctx->completion_lock);
b18032bb
JA
2233 if (posted)
2234 io_cqring_ev_posted(ctx);
09e88404 2235 return all_flushed;
1d7bb1d5
JA
2236}
2237
90f67366 2238static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
6c503150 2239{
ca0a2651
JA
2240 bool ret = true;
2241
10988a0a 2242 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
6c503150
PB
2243 /* iopoll syncs against uring_lock, not completion_lock */
2244 if (ctx->flags & IORING_SETUP_IOPOLL)
2245 mutex_lock(&ctx->uring_lock);
90f67366 2246 ret = __io_cqring_overflow_flush(ctx, false);
6c503150
PB
2247 if (ctx->flags & IORING_SETUP_IOPOLL)
2248 mutex_unlock(&ctx->uring_lock);
2249 }
ca0a2651
JA
2250
2251 return ret;
6c503150
PB
2252}
2253
9d170164 2254static void __io_put_task(struct task_struct *task, int nr)
6a290a14
PB
2255{
2256 struct io_uring_task *tctx = task->io_uring;
2257
9d170164
PB
2258 percpu_counter_sub(&tctx->inflight, nr);
2259 if (unlikely(atomic_read(&tctx->in_idle)))
2260 wake_up(&tctx->wait);
2261 put_task_struct_many(task, nr);
2262}
2263
2264/* must to be called somewhat shortly after putting a request */
2265static inline void io_put_task(struct task_struct *task, int nr)
2266{
2267 if (likely(task == current))
2268 task->io_uring->cached_refs += nr;
2269 else
2270 __io_put_task(task, nr);
6a290a14
PB
2271}
2272
9a10867a
PB
2273static void io_task_refs_refill(struct io_uring_task *tctx)
2274{
2275 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
2276
2277 percpu_counter_add(&tctx->inflight, refill);
2278 refcount_add(refill, &current->usage);
2279 tctx->cached_refs += refill;
2280}
2281
2282static inline void io_get_task_refs(int nr)
2283{
2284 struct io_uring_task *tctx = current->io_uring;
2285
2286 tctx->cached_refs -= nr;
2287 if (unlikely(tctx->cached_refs < 0))
2288 io_task_refs_refill(tctx);
2289}
2290
3cc7fdb9
PB
2291static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
2292{
2293 struct io_uring_task *tctx = task->io_uring;
2294 unsigned int refs = tctx->cached_refs;
2295
2296 if (refs) {
2297 tctx->cached_refs = 0;
2298 percpu_counter_sub(&tctx->inflight, refs);
2299 put_task_struct_many(task, refs);
2300 }
2301}
2302
d4d19c19 2303static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
e45a3e05
SR
2304 s32 res, u32 cflags, u64 extra1,
2305 u64 extra2)
2b188cc1 2306{
cce4b8b0 2307 struct io_overflow_cqe *ocqe;
e45a3e05
SR
2308 size_t ocq_size = sizeof(struct io_overflow_cqe);
2309 bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
2310
2311 if (is_cqe32)
2312 ocq_size += sizeof(struct io_uring_cqe);
2b188cc1 2313
e45a3e05 2314 ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
08dcd028 2315 trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
cce4b8b0
PB
2316 if (!ocqe) {
2317 /*
2318 * If we're in ring overflow flush mode, or in task cancel mode,
2319 * or cannot allocate an overflow entry, then we need to drop it
2320 * on the floor.
2321 */
8f6ed49a 2322 io_account_cq_overflow(ctx);
155bc950 2323 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
cce4b8b0 2324 return false;
2b188cc1 2325 }
cce4b8b0 2326 if (list_empty(&ctx->cq_overflow_list)) {
10988a0a 2327 set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
3a4b89a2 2328 atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
20c0b380 2329
cce4b8b0 2330 }
d4d19c19 2331 ocqe->cqe.user_data = user_data;
cce4b8b0
PB
2332 ocqe->cqe.res = res;
2333 ocqe->cqe.flags = cflags;
e45a3e05
SR
2334 if (is_cqe32) {
2335 ocqe->cqe.big_cqe[0] = extra1;
2336 ocqe->cqe.big_cqe[1] = extra2;
2337 }
cce4b8b0
PB
2338 list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
2339 return true;
2b188cc1
JA
2340}
2341
ae4da189 2342static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
913a571a 2343 s32 res, u32 cflags)
2b188cc1
JA
2344{
2345 struct io_uring_cqe *cqe;
2346
2347 /*
2348 * If we can't get a cq entry, userspace overflowed the
2349 * submission (by quite a lot). Increment the overflow count in
2350 * the ring.
2351 */
d068b506 2352 cqe = io_get_cqe(ctx);
1d7bb1d5 2353 if (likely(cqe)) {
d4d19c19 2354 WRITE_ONCE(cqe->user_data, user_data);
2b188cc1 2355 WRITE_ONCE(cqe->res, res);
bcda7baa 2356 WRITE_ONCE(cqe->flags, cflags);
8d13326e 2357 return true;
2b188cc1 2358 }
e45a3e05 2359 return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
2b188cc1
JA
2360}
2361
90e7c35f
PB
2362static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
2363 struct io_kiocb *req)
d5ec1dfa 2364{
90e7c35f
PB
2365 struct io_uring_cqe *cqe;
2366
2367 trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
c4bb964f 2368 req->cqe.res, req->cqe.flags, 0, 0);
90e7c35f
PB
2369
2370 /*
2371 * If we can't get a cq entry, userspace overflowed the
2372 * submission (by quite a lot). Increment the overflow count in
2373 * the ring.
2374 */
2375 cqe = io_get_cqe(ctx);
2376 if (likely(cqe)) {
2377 memcpy(cqe, &req->cqe, sizeof(*cqe));
2378 return true;
2379 }
2380 return io_cqring_event_overflow(ctx, req->cqe.user_data,
e45a3e05 2381 req->cqe.res, req->cqe.flags, 0, 0);
d5ec1dfa
SR
2382}
2383
91658798
SR
2384static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
2385 struct io_kiocb *req)
2386{
2387 struct io_uring_cqe *cqe;
2388 u64 extra1 = req->extra1;
2389 u64 extra2 = req->extra2;
2390
2391 trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
c4bb964f 2392 req->cqe.res, req->cqe.flags, extra1, extra2);
91658798
SR
2393
2394 /*
2395 * If we can't get a cq entry, userspace overflowed the
2396 * submission (by quite a lot). Increment the overflow count in
2397 * the ring.
2398 */
2399 cqe = io_get_cqe(ctx);
2400 if (likely(cqe)) {
2401 memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
2402 cqe->big_cqe[0] = extra1;
2403 cqe->big_cqe[1] = extra2;
2404 return true;
2405 }
2406
e45a3e05
SR
2407 return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
2408 req->cqe.flags, extra1, extra2);
91658798
SR
2409}
2410
ae4da189 2411static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
bcda7baa 2412{
c4bb964f 2413 trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
cef216fc 2414 return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
bcda7baa
JA
2415}
2416
91658798
SR
2417static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
2418 u64 extra1, u64 extra2)
2419{
2420 struct io_ring_ctx *ctx = req->ctx;
2421 struct io_uring_cqe *cqe;
2422
2423 if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
2424 return;
2425 if (req->flags & REQ_F_CQE_SKIP)
2426 return;
2427
c4bb964f
SR
2428 trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
2429 extra1, extra2);
91658798
SR
2430
2431 /*
2432 * If we can't get a cq entry, userspace overflowed the
2433 * submission (by quite a lot). Increment the overflow count in
2434 * the ring.
2435 */
2436 cqe = io_get_cqe(ctx);
2437 if (likely(cqe)) {
2438 WRITE_ONCE(cqe->user_data, req->cqe.user_data);
2439 WRITE_ONCE(cqe->res, res);
2440 WRITE_ONCE(cqe->flags, cflags);
2441 WRITE_ONCE(cqe->big_cqe[0], extra1);
2442 WRITE_ONCE(cqe->big_cqe[1], extra2);
2443 return;
2444 }
2445
e45a3e05 2446 io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
91658798
SR
2447}
2448
913a571a
PB
2449static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
2450 s32 res, u32 cflags)
bcda7baa 2451{
913a571a 2452 ctx->cq_extra++;
c4bb964f 2453 trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
ae4da189 2454 return __io_fill_cqe(ctx, user_data, res, cflags);
bcda7baa
JA
2455}
2456
effcf8bd 2457static void __io_req_complete_put(struct io_kiocb *req)
2b188cc1 2458{
c7dae4ba
JA
2459 /*
2460 * If we're the last reference to this request, add to our locked
2461 * free_list cache.
2462 */
de9b4cca 2463 if (req_ref_put_and_test(req)) {
effcf8bd
SR
2464 struct io_ring_ctx *ctx = req->ctx;
2465
da1a08c5 2466 if (req->flags & IO_REQ_LINK_FLAGS) {
0756a869 2467 if (req->flags & IO_DISARM_MASK)
7a612350
PB
2468 io_disarm_next(req);
2469 if (req->link) {
2470 io_req_task_queue(req->link);
2471 req->link = NULL;
2472 }
2473 }
7ac1edc4 2474 io_req_put_rsrc(req);
8197b053
PB
2475 /*
2476 * Selected buffer deallocation in io_clean_op() assumes that
2477 * we don't hold ->completion_lock. Clean them here to avoid
2478 * deadlocks.
2479 */
2480 io_put_kbuf_comp(req);
c7dae4ba
JA
2481 io_dismantle_req(req);
2482 io_put_task(req->task, 1);
c2b6c6bc 2483 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
d0acdee2 2484 ctx->locked_free_nr++;
180f829f 2485 }
a37fae8a
HX
2486}
2487
effcf8bd
SR
2488static void __io_req_complete_post(struct io_kiocb *req, s32 res,
2489 u32 cflags)
2490{
2491 if (!(req->flags & REQ_F_CQE_SKIP))
2492 __io_fill_cqe_req(req, res, cflags);
2493 __io_req_complete_put(req);
2494}
2495
2496static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
2497 u32 cflags, u64 extra1, u64 extra2)
2498{
2499 if (!(req->flags & REQ_F_CQE_SKIP))
2500 __io_fill_cqe32_req(req, res, cflags, extra1, extra2);
2501 __io_req_complete_put(req);
2502}
2503
2504static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
a37fae8a
HX
2505{
2506 struct io_ring_ctx *ctx = req->ctx;
2507
2508 spin_lock(&ctx->completion_lock);
2509 __io_req_complete_post(req, res, cflags);
7a612350 2510 io_commit_cqring(ctx);
79ebeaee 2511 spin_unlock(&ctx->completion_lock);
a3f34907 2512 io_cqring_ev_posted(ctx);
4e3d9ff9
JA
2513}
2514
effcf8bd
SR
2515static void io_req_complete_post32(struct io_kiocb *req, s32 res,
2516 u32 cflags, u64 extra1, u64 extra2)
2517{
2518 struct io_ring_ctx *ctx = req->ctx;
2519
2520 spin_lock(&ctx->completion_lock);
2521 __io_req_complete_post32(req, res, cflags, extra1, extra2);
2522 io_commit_cqring(ctx);
2523 spin_unlock(&ctx->completion_lock);
2524 io_cqring_ev_posted(ctx);
2525}
2526
54daa9b2
PB
2527static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
2528 u32 cflags)
229a7b63 2529{
cef216fc
PB
2530 req->cqe.res = res;
2531 req->cqe.flags = cflags;
e342c807 2532 req->flags |= REQ_F_COMPLETE_INLINE;
e1e16097
JA
2533}
2534
889fca73 2535static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
54daa9b2 2536 s32 res, u32 cflags)
bcda7baa 2537{
889fca73
PB
2538 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
2539 io_req_complete_state(req, res, cflags);
a38d68db 2540 else
c7dae4ba 2541 io_req_complete_post(req, res, cflags);
bcda7baa
JA
2542}
2543
effcf8bd
SR
2544static inline void __io_req_complete32(struct io_kiocb *req,
2545 unsigned int issue_flags, s32 res,
2546 u32 cflags, u64 extra1, u64 extra2)
2547{
2548 if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
2549 io_req_complete_state(req, res, cflags);
2550 req->extra1 = extra1;
2551 req->extra2 = extra2;
2552 } else {
2553 io_req_complete_post32(req, res, cflags, extra1, extra2);
2554 }
2555}
2556
54daa9b2 2557static inline void io_req_complete(struct io_kiocb *req, s32 res)
0ddf92e8 2558{
889fca73 2559 __io_req_complete(req, 0, res, 0);
0ddf92e8
JA
2560}
2561
54daa9b2 2562static void io_req_complete_failed(struct io_kiocb *req, s32 res)
f41db273 2563{
93d2bcd2 2564 req_set_fail(req);
ab0ac095 2565 io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
f41db273
PB
2566}
2567
864ea921
PB
2568/*
2569 * Don't initialise the fields below on every allocation, but do that in
2570 * advance and keep them valid across allocations.
2571 */
2572static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
2573{
2574 req->ctx = ctx;
2575 req->link = NULL;
2576 req->async_data = NULL;
2577 /* not necessary, but safer to zero */
cef216fc 2578 req->cqe.res = 0;
864ea921
PB
2579}
2580
dac7a098 2581static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
cd0ca2e0 2582 struct io_submit_state *state)
dac7a098 2583{
79ebeaee 2584 spin_lock(&ctx->completion_lock);
c2b6c6bc 2585 wq_list_splice(&ctx->locked_free_list, &state->free_list);
d0acdee2 2586 ctx->locked_free_nr = 0;
79ebeaee 2587 spin_unlock(&ctx->completion_lock);
dac7a098
PB
2588}
2589
88ab95be 2590static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
0ddf92e8 2591{
88ab95be 2592 return !ctx->submit_state.free_list.next;
0ddf92e8
JA
2593}
2594
5d5901a3
PB
2595/*
2596 * A request might get retired back into the request caches even before opcode
2597 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
2598 * Because of that, io_alloc_req() should be called only under ->uring_lock
2599 * and with extra caution to not get a request that is still worked on.
2600 */
c072481d 2601static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
5d5901a3 2602 __must_hold(&ctx->uring_lock)
2b188cc1 2603{
864ea921 2604 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
3ab665b7 2605 void *reqs[IO_REQ_ALLOC_BATCH];
864ea921 2606 int ret, i;
e5d1bc0a 2607
23a5c43b
PB
2608 /*
2609 * If we have more than a batch's worth of requests in our IRQ side
2610 * locked cache, grab the lock and move them over to our submission
2611 * side cache.
2612 */
a6d97a8a 2613 if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
23a5c43b 2614 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
88ab95be 2615 if (!io_req_cache_empty(ctx))
23a5c43b
PB
2616 return true;
2617 }
e5d1bc0a 2618
3ab665b7 2619 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
fd6fab2c 2620
864ea921
PB
2621 /*
2622 * Bulk alloc is all-or-nothing. If we fail to get a batch,
2623 * retry single alloc to be on the safe side.
2624 */
2625 if (unlikely(ret <= 0)) {
3ab665b7
PB
2626 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2627 if (!reqs[0])
a33ae9ce 2628 return false;
864ea921 2629 ret = 1;
2b188cc1 2630 }
864ea921 2631
37f0e767 2632 percpu_ref_get_many(&ctx->refs, ret);
3ab665b7 2633 for (i = 0; i < ret; i++) {
23a5c43b 2634 struct io_kiocb *req = reqs[i];
3ab665b7
PB
2635
2636 io_preinit_req(req, ctx);
fa05457a 2637 io_req_add_to_cache(req, ctx);
3ab665b7 2638 }
a33ae9ce
PB
2639 return true;
2640}
2641
2642static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
2643{
88ab95be 2644 if (unlikely(io_req_cache_empty(ctx)))
a33ae9ce
PB
2645 return __io_alloc_req_refill(ctx);
2646 return true;
2647}
2648
2649static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2650{
2651 struct io_wq_work_node *node;
2652
2653 node = wq_stack_extract(&ctx->submit_state.free_list);
c2b6c6bc 2654 return container_of(node, struct io_kiocb, comp_list);
2b188cc1
JA
2655}
2656
e1d767f0 2657static inline void io_put_file(struct file *file)
8da11c19 2658{
e1d767f0 2659 if (file)
8da11c19
PB
2660 fput(file);
2661}
2662
6b639522 2663static inline void io_dismantle_req(struct io_kiocb *req)
2b188cc1 2664{
094bae49 2665 unsigned int flags = req->flags;
929a3af9 2666
867f8fa5 2667 if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
3a0a6902 2668 io_clean_op(req);
e1d767f0
PB
2669 if (!(flags & REQ_F_FIXED_FILE))
2670 io_put_file(req->file);
e65ef56d
JA
2671}
2672
f5c6cf2a 2673static __cold void io_free_req(struct io_kiocb *req)
c6ca97b3 2674{
51a4cc11 2675 struct io_ring_ctx *ctx = req->ctx;
c6ca97b3 2676
7ac1edc4 2677 io_req_put_rsrc(req);
216578e5 2678 io_dismantle_req(req);
7c660731 2679 io_put_task(req->task, 1);
c6ca97b3 2680
79ebeaee 2681 spin_lock(&ctx->completion_lock);
c2b6c6bc 2682 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
c34b025f 2683 ctx->locked_free_nr++;
79ebeaee 2684 spin_unlock(&ctx->completion_lock);
e65ef56d
JA
2685}
2686
f2f87370
PB
2687static inline void io_remove_next_linked(struct io_kiocb *req)
2688{
2689 struct io_kiocb *nxt = req->link;
2690
2691 req->link = nxt->link;
2692 nxt->link = NULL;
2693}
2694
81ec803b 2695static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
33cc89a9 2696 __must_hold(&req->ctx->completion_lock)
89b263f6 2697 __must_hold(&req->ctx->timeout_lock)
2665abfd 2698{
33cc89a9 2699 struct io_kiocb *link = req->link;
f2f87370 2700
b97e736a 2701 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
c9abd7ad 2702 struct io_timeout_data *io = link->async_data;
7c86ffee 2703
f2f87370 2704 io_remove_next_linked(req);
90cd7e42 2705 link->timeout.head = NULL;
fd9c7bc5 2706 if (hrtimer_try_to_cancel(&io->timer) != -1) {
ef9dd637 2707 list_del(&link->timeout.list);
81ec803b 2708 return link;
c9abd7ad
PB
2709 }
2710 }
81ec803b 2711 return NULL;
7c86ffee
PB
2712}
2713
d148ca4b 2714static void io_fail_links(struct io_kiocb *req)
33cc89a9 2715 __must_hold(&req->ctx->completion_lock)
9e645e11 2716{
33cc89a9 2717 struct io_kiocb *nxt, *link = req->link;
04c76b41 2718 bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
9e645e11 2719
f2f87370 2720 req->link = NULL;
f2f87370 2721 while (link) {
a8295b98
HX
2722 long res = -ECANCELED;
2723
2724 if (link->flags & REQ_F_FAIL)
cef216fc 2725 res = link->cqe.res;
a8295b98 2726
f2f87370
PB
2727 nxt = link->link;
2728 link->link = NULL;
2665abfd 2729
cef216fc 2730 trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data,
502c87d6
SR
2731 req->opcode, link);
2732
4e118cd9
PB
2733 if (ignore_cqes)
2734 link->flags |= REQ_F_CQE_SKIP;
2735 else
04c76b41 2736 link->flags &= ~REQ_F_CQE_SKIP;
4e118cd9 2737 __io_req_complete_post(link, res, 0);
f2f87370 2738 link = nxt;
9e645e11 2739 }
33cc89a9 2740}
9e645e11 2741
33cc89a9
PB
2742static bool io_disarm_next(struct io_kiocb *req)
2743 __must_hold(&req->ctx->completion_lock)
2744{
81ec803b 2745 struct io_kiocb *link = NULL;
33cc89a9
PB
2746 bool posted = false;
2747
0756a869 2748 if (req->flags & REQ_F_ARM_LTIMEOUT) {
81ec803b 2749 link = req->link;
906c6caa 2750 req->flags &= ~REQ_F_ARM_LTIMEOUT;
0756a869
PB
2751 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2752 io_remove_next_linked(req);
4e118cd9 2753 io_req_tw_post_queue(link, -ECANCELED, 0);
0756a869
PB
2754 posted = true;
2755 }
2756 } else if (req->flags & REQ_F_LINK_TIMEOUT) {
89b263f6
JA
2757 struct io_ring_ctx *ctx = req->ctx;
2758
2759 spin_lock_irq(&ctx->timeout_lock);
81ec803b 2760 link = io_disarm_linked_timeout(req);
89b263f6 2761 spin_unlock_irq(&ctx->timeout_lock);
81ec803b
PB
2762 if (link) {
2763 posted = true;
2764 io_req_tw_post_queue(link, -ECANCELED, 0);
2765 }
89b263f6 2766 }
93d2bcd2 2767 if (unlikely((req->flags & REQ_F_FAIL) &&
e4335ed3 2768 !(req->flags & REQ_F_HARDLINK))) {
33cc89a9
PB
2769 posted |= (req->link != NULL);
2770 io_fail_links(req);
2771 }
2772 return posted;
9e645e11
JA
2773}
2774
d81499bf
PB
2775static void __io_req_find_next_prep(struct io_kiocb *req)
2776{
2777 struct io_ring_ctx *ctx = req->ctx;
2778 bool posted;
2779
2780 spin_lock(&ctx->completion_lock);
2781 posted = io_disarm_next(req);
60053be8 2782 io_commit_cqring(ctx);
d81499bf
PB
2783 spin_unlock(&ctx->completion_lock);
2784 if (posted)
2785 io_cqring_ev_posted(ctx);
2786}
2787
2788static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
c69f8dbe 2789{
33cc89a9 2790 struct io_kiocb *nxt;
944e58bf 2791
9e645e11
JA
2792 /*
2793 * If LINK is set, we have dependent requests in this chain. If we
2794 * didn't fail this request, queue the first one up, moving any other
2795 * dependencies to the next request. In case of failure, fail the rest
2796 * of the chain.
2797 */
d81499bf
PB
2798 if (unlikely(req->flags & IO_DISARM_MASK))
2799 __io_req_find_next_prep(req);
33cc89a9
PB
2800 nxt = req->link;
2801 req->link = NULL;
2802 return nxt;
4d7dd462 2803}
9e645e11 2804
f237c30a 2805static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2c32395d
PB
2806{
2807 if (!ctx)
2808 return;
ef060ea9
JA
2809 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
2810 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
f237c30a 2811 if (*locked) {
c450178d 2812 io_submit_flush_completions(ctx);
2c32395d 2813 mutex_unlock(&ctx->uring_lock);
f237c30a 2814 *locked = false;
2c32395d
PB
2815 }
2816 percpu_ref_put(&ctx->refs);
2817}
2818
f28c240e
HX
2819static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
2820{
2821 io_commit_cqring(ctx);
2822 spin_unlock(&ctx->completion_lock);
2823 io_cqring_ev_posted(ctx);
2824}
2825
2826static void handle_prev_tw_list(struct io_wq_work_node *node,
2827 struct io_ring_ctx **ctx, bool *uring_locked)
2828{
2829 if (*ctx && !*uring_locked)
2830 spin_lock(&(*ctx)->completion_lock);
2831
2832 do {
2833 struct io_wq_work_node *next = node->next;
2834 struct io_kiocb *req = container_of(node, struct io_kiocb,
2835 io_task_work.node);
2836
34d2bfe7
JA
2837 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2838
f28c240e
HX
2839 if (req->ctx != *ctx) {
2840 if (unlikely(!*uring_locked && *ctx))
2841 ctx_commit_and_unlock(*ctx);
2842
2843 ctx_flush_and_put(*ctx, uring_locked);
2844 *ctx = req->ctx;
2845 /* if not contended, grab and improve batching */
2846 *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
2847 percpu_ref_get(&(*ctx)->refs);
2848 if (unlikely(!*uring_locked))
2849 spin_lock(&(*ctx)->completion_lock);
2850 }
2851 if (likely(*uring_locked))
2852 req->io_task_work.func(req, uring_locked);
2853 else
cef216fc 2854 __io_req_complete_post(req, req->cqe.res,
cc3cec83 2855 io_put_kbuf_comp(req));
f28c240e
HX
2856 node = next;
2857 } while (node);
2858
2859 if (unlikely(!*uring_locked))
2860 ctx_commit_and_unlock(*ctx);
2861}
2862
2863static void handle_tw_list(struct io_wq_work_node *node,
2864 struct io_ring_ctx **ctx, bool *locked)
9f8d032a
HX
2865{
2866 do {
2867 struct io_wq_work_node *next = node->next;
2868 struct io_kiocb *req = container_of(node, struct io_kiocb,
2869 io_task_work.node);
2870
34d2bfe7
JA
2871 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2872
9f8d032a
HX
2873 if (req->ctx != *ctx) {
2874 ctx_flush_and_put(*ctx, locked);
2875 *ctx = req->ctx;
2876 /* if not contended, grab and improve batching */
2877 *locked = mutex_trylock(&(*ctx)->uring_lock);
2878 percpu_ref_get(&(*ctx)->refs);
2879 }
2880 req->io_task_work.func(req, locked);
2881 node = next;
2882 } while (node);
2883}
2884
7cbf1722 2885static void tctx_task_work(struct callback_head *cb)
c40f6379 2886{
f28c240e 2887 bool uring_locked = false;
ebd0df2e 2888 struct io_ring_ctx *ctx = NULL;
3f18407d
PB
2889 struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2890 task_work);
c40f6379 2891
16f72070 2892 while (1) {
f28c240e 2893 struct io_wq_work_node *node1, *node2;
3f18407d
PB
2894
2895 spin_lock_irq(&tctx->task_lock);
f28c240e
HX
2896 node1 = tctx->prior_task_list.first;
2897 node2 = tctx->task_list.first;
3f18407d 2898 INIT_WQ_LIST(&tctx->task_list);
f28c240e
HX
2899 INIT_WQ_LIST(&tctx->prior_task_list);
2900 if (!node2 && !node1)
6294f368 2901 tctx->task_running = false;
3f18407d 2902 spin_unlock_irq(&tctx->task_lock);
f28c240e 2903 if (!node2 && !node1)
6294f368 2904 break;
3f18407d 2905
f28c240e
HX
2906 if (node1)
2907 handle_prev_tw_list(node1, &ctx, &uring_locked);
f28c240e
HX
2908 if (node2)
2909 handle_tw_list(node2, &ctx, &uring_locked);
7cbf1722 2910 cond_resched();
68ca8fc0 2911
a6d97a8a
PB
2912 if (data_race(!tctx->task_list.first) &&
2913 data_race(!tctx->prior_task_list.first) && uring_locked)
68ca8fc0 2914 io_submit_flush_completions(ctx);
3f18407d 2915 }
ebd0df2e 2916
f28c240e 2917 ctx_flush_and_put(ctx, &uring_locked);
3cc7fdb9
PB
2918
2919 /* relaxed read is enough as only the task itself sets ->in_idle */
2920 if (unlikely(atomic_read(&tctx->in_idle)))
2921 io_uring_drop_tctx_refs(current);
7cbf1722
JA
2922}
2923
4813c377 2924static void io_req_task_work_add(struct io_kiocb *req, bool priority)
7cbf1722 2925{
c15b79de 2926 struct task_struct *tsk = req->task;
9f010507 2927 struct io_ring_ctx *ctx = req->ctx;
7cbf1722 2928 struct io_uring_task *tctx = tsk->io_uring;
e09ee510 2929 struct io_wq_work_node *node;
0b81e80c 2930 unsigned long flags;
6294f368 2931 bool running;
7cbf1722
JA
2932
2933 WARN_ON_ONCE(!tctx);
2934
d5361233
JA
2935 io_drop_inflight_file(req);
2936
0b81e80c 2937 spin_lock_irqsave(&tctx->task_lock, flags);
4813c377
HX
2938 if (priority)
2939 wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
2940 else
2941 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
6294f368
PB
2942 running = tctx->task_running;
2943 if (!running)
2944 tctx->task_running = true;
0b81e80c 2945 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722
JA
2946
2947 /* task_work already pending, we're done */
6294f368 2948 if (running)
e09ee510 2949 return;
7cbf1722 2950
ef060ea9
JA
2951 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
2952 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
2953
9f010507 2954 if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method)))
e09ee510 2955 return;
2215bed9 2956
0b81e80c 2957 spin_lock_irqsave(&tctx->task_lock, flags);
6294f368 2958 tctx->task_running = false;
4813c377 2959 node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
0b81e80c 2960 spin_unlock_irqrestore(&tctx->task_lock, flags);
7cbf1722 2961
e09ee510
PB
2962 while (node) {
2963 req = container_of(node, struct io_kiocb, io_task_work.node);
2964 node = node->next;
2965 if (llist_add(&req->io_task_work.fallback_node,
2966 &req->ctx->fallback_llist))
2967 schedule_delayed_work(&req->ctx->fallback_work, 1);
2968 }
eab30c4d
PB
2969}
2970
4e118cd9 2971static void io_req_tw_post(struct io_kiocb *req, bool *locked)
c40f6379 2972{
4e118cd9
PB
2973 io_req_complete_post(req, req->cqe.res, req->cqe.flags);
2974}
c40f6379 2975
4e118cd9
PB
2976static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
2977{
2978 req->cqe.res = res;
2979 req->cqe.flags = cflags;
2980 req->io_task_work.func = io_req_tw_post;
2981 io_req_task_work_add(req, false);
2982}
2983
f237c30a 2984static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
c40f6379 2985{
b18a1a45 2986 /* not needed for normal modes, but SQPOLL depends on it */
971cf9c1 2987 io_tw_lock(req->ctx, locked);
cef216fc 2988 io_req_complete_failed(req, req->cqe.res);
c40f6379
JA
2989}
2990
f237c30a 2991static void io_req_task_submit(struct io_kiocb *req, bool *locked)
c40f6379 2992{
971cf9c1 2993 io_tw_lock(req->ctx, locked);
316319e8 2994 /* req->task == current here, checking PF_EXITING is safe */
af066f31 2995 if (likely(!(req->task->flags & PF_EXITING)))
cbc2e203 2996 io_queue_sqe(req);
81b6d05c 2997 else
2593553a 2998 io_req_complete_failed(req, -EFAULT);
c40f6379
JA
2999}
3000
2c4b8eb6 3001static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
c40f6379 3002{
cef216fc 3003 req->cqe.res = ret;
5b0a6acc 3004 req->io_task_work.func = io_req_task_cancel;
4813c377 3005 io_req_task_work_add(req, false);
c40f6379
JA
3006}
3007
2c4b8eb6 3008static void io_req_task_queue(struct io_kiocb *req)
a3df7698 3009{
5b0a6acc 3010 req->io_task_work.func = io_req_task_submit;
4813c377 3011 io_req_task_work_add(req, false);
a3df7698
PB
3012}
3013
773af691
JA
3014static void io_req_task_queue_reissue(struct io_kiocb *req)
3015{
77955efb 3016 req->io_task_work.func = io_queue_iowq;
4813c377 3017 io_req_task_work_add(req, false);
773af691
JA
3018}
3019
57859f4d 3020static void io_queue_next(struct io_kiocb *req)
c69f8dbe 3021{
9b5f7bd9 3022 struct io_kiocb *nxt = io_req_find_next(req);
944e58bf
PB
3023
3024 if (nxt)
906a8c3f 3025 io_req_task_queue(nxt);
c69f8dbe
JL
3026}
3027
3aa83bfb 3028static void io_free_batch_list(struct io_ring_ctx *ctx,
1cce17ac 3029 struct io_wq_work_node *node)
3aa83bfb 3030 __must_hold(&ctx->uring_lock)
5af1d13e 3031{
d4b7a5ef 3032 struct task_struct *task = NULL;
37f0e767 3033 int task_refs = 0;
5af1d13e 3034
3aa83bfb
PB
3035 do {
3036 struct io_kiocb *req = container_of(node, struct io_kiocb,
3037 comp_list);
2d6500d4 3038
a538be5b
PB
3039 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
3040 if (req->flags & REQ_F_REFCOUNT) {
3041 node = req->comp_list.next;
3042 if (!req_ref_put_and_test(req))
3043 continue;
3044 }
b605a7fa
PB
3045 if ((req->flags & REQ_F_POLLED) && req->apoll) {
3046 struct async_poll *apoll = req->apoll;
3047
3048 if (apoll->double_poll)
3049 kfree(apoll->double_poll);
3050 list_add(&apoll->poll.wait.entry,
3051 &ctx->apoll_cache);
3052 req->flags &= ~REQ_F_POLLED;
3053 }
da1a08c5 3054 if (req->flags & IO_REQ_LINK_FLAGS)
57859f4d 3055 io_queue_next(req);
a538be5b
PB
3056 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
3057 io_clean_op(req);
c1e53a69 3058 }
a538be5b
PB
3059 if (!(req->flags & REQ_F_FIXED_FILE))
3060 io_put_file(req->file);
2d6500d4 3061
ab409402 3062 io_req_put_rsrc_locked(req, ctx);
5af1d13e 3063
d4b7a5ef
PB
3064 if (req->task != task) {
3065 if (task)
3066 io_put_task(task, task_refs);
3067 task = req->task;
3068 task_refs = 0;
3069 }
3070 task_refs++;
c1e53a69 3071 node = req->comp_list.next;
fa05457a 3072 io_req_add_to_cache(req, ctx);
3aa83bfb 3073 } while (node);
d4b7a5ef 3074
d4b7a5ef
PB
3075 if (task)
3076 io_put_task(task, task_refs);
7a743e22
PB
3077}
3078
c450178d 3079static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
a141dd89 3080 __must_hold(&ctx->uring_lock)
905c172f 3081{
6f33b0bc 3082 struct io_wq_work_node *node, *prev;
cd0ca2e0 3083 struct io_submit_state *state = &ctx->submit_state;
905c172f 3084
3d4aeb9f
PB
3085 if (state->flush_cqes) {
3086 spin_lock(&ctx->completion_lock);
3087 wq_list_for_each(node, prev, &state->compl_reqs) {
3088 struct io_kiocb *req = container_of(node, struct io_kiocb,
6f33b0bc 3089 comp_list);
5182ed2e 3090
0e2e5c47
SR
3091 if (!(req->flags & REQ_F_CQE_SKIP)) {
3092 if (!(ctx->flags & IORING_SETUP_CQE32))
3093 __io_fill_cqe_req_filled(ctx, req);
3094 else
3095 __io_fill_cqe32_req_filled(ctx, req);
3096 }
3d4aeb9f
PB
3097 }
3098
3099 io_commit_cqring(ctx);
3100 spin_unlock(&ctx->completion_lock);
3101 io_cqring_ev_posted(ctx);
3102 state->flush_cqes = false;
905c172f 3103 }
5182ed2e 3104
1cce17ac 3105 io_free_batch_list(ctx, state->compl_reqs.first);
6f33b0bc 3106 INIT_WQ_LIST(&state->compl_reqs);
7a743e22
PB
3107}
3108
ba816ad6
JA
3109/*
3110 * Drop reference to request, return next in chain (if there is one) if this
3111 * was the last reference to this request.
3112 */
0d85035a 3113static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
e65ef56d 3114{
9b5f7bd9
PB
3115 struct io_kiocb *nxt = NULL;
3116
de9b4cca 3117 if (req_ref_put_and_test(req)) {
da1a08c5 3118 if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
7819a1f6 3119 nxt = io_req_find_next(req);
f5c6cf2a 3120 io_free_req(req);
2a44f467 3121 }
9b5f7bd9 3122 return nxt;
2b188cc1
JA
3123}
3124
0d85035a 3125static inline void io_put_req(struct io_kiocb *req)
216578e5 3126{
91c2f697 3127 if (req_ref_put_and_test(req)) {
f5c6cf2a 3128 io_queue_next(req);
e65ef56d 3129 io_free_req(req);
543af3a1 3130 }
216578e5
PB
3131}
3132
6c503150 3133static unsigned io_cqring_events(struct io_ring_ctx *ctx)
a3a0e43f
JA
3134{
3135 /* See comment at the top of this file */
3136 smp_rmb();
e23de15f 3137 return __io_cqring_events(ctx);
a3a0e43f
JA
3138}
3139
fb5ccc98
PB
3140static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
3141{
3142 struct io_rings *rings = ctx->rings;
3143
3144 /* make sure SQ entry isn't read before tail */
3145 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
3146}
3147
4c6e277c
JA
3148static inline bool io_run_task_work(void)
3149{
7f62d40d 3150 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
4c6e277c 3151 __set_current_state(TASK_RUNNING);
7c5d8fa6
EB
3152 clear_notify_signal();
3153 if (task_work_pending(current))
3154 task_work_run();
4c6e277c
JA
3155 return true;
3156 }
3157
3158 return false;
bcda7baa
JA
3159}
3160
5ba3c874 3161static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
def596e9 3162{
5eef4e87 3163 struct io_wq_work_node *pos, *start, *prev;
d729cf9a 3164 unsigned int poll_flags = BLK_POLL_NOSLEEP;
b688f11e 3165 DEFINE_IO_COMP_BATCH(iob);
5ba3c874 3166 int nr_events = 0;
def596e9
JA
3167
3168 /*
3169 * Only spin for completions if we don't have multiple devices hanging
87a115fb 3170 * off our complete list.
def596e9 3171 */
87a115fb 3172 if (ctx->poll_multi_queue || force_nonspin)
ef99b2d3 3173 poll_flags |= BLK_POLL_ONESHOT;
def596e9 3174
5eef4e87
PB
3175 wq_list_for_each(pos, start, &ctx->iopoll_list) {
3176 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
9adbd45d 3177 struct kiocb *kiocb = &req->rw.kiocb;
a2416e1e 3178 int ret;
def596e9
JA
3179
3180 /*
581f9810
BM
3181 * Move completed and retryable entries to our local lists.
3182 * If we find a request that requires polling, break out
3183 * and complete those lists first, if we have entries there.
def596e9 3184 */
e3f721e6 3185 if (READ_ONCE(req->iopoll_completed))
def596e9
JA
3186 break;
3187
b688f11e 3188 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
a2416e1e
PB
3189 if (unlikely(ret < 0))
3190 return ret;
3191 else if (ret)
ef99b2d3 3192 poll_flags |= BLK_POLL_ONESHOT;
def596e9 3193
3aadc23e 3194 /* iopoll may have completed current req */
b688f11e
JA
3195 if (!rq_list_empty(iob.req_list) ||
3196 READ_ONCE(req->iopoll_completed))
e3f721e6 3197 break;
def596e9
JA
3198 }
3199
b688f11e
JA
3200 if (!rq_list_empty(iob.req_list))
3201 iob.complete(&iob);
5eef4e87
PB
3202 else if (!pos)
3203 return 0;
def596e9 3204
5eef4e87
PB
3205 prev = start;
3206 wq_list_for_each_resume(pos, prev) {
3207 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
3208
b3fa03fd
PB
3209 /* order with io_complete_rw_iopoll(), e.g. ->result updates */
3210 if (!smp_load_acquire(&req->iopoll_completed))
e3f721e6 3211 break;
c0713540 3212 nr_events++;
83a13a41
PB
3213 if (unlikely(req->flags & REQ_F_CQE_SKIP))
3214 continue;
cef216fc 3215 __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
e3f721e6 3216 }
def596e9 3217
f5ed3bcd
PB
3218 if (unlikely(!nr_events))
3219 return 0;
3220
3221 io_commit_cqring(ctx);
3222 io_cqring_ev_posted_iopoll(ctx);
1cce17ac 3223 pos = start ? start->next : ctx->iopoll_list.first;
5eef4e87 3224 wq_list_cut(&ctx->iopoll_list, prev, start);
1cce17ac 3225 io_free_batch_list(ctx, pos);
5ba3c874 3226 return nr_events;
def596e9
JA
3227}
3228
def596e9
JA
3229/*
3230 * We can't just wait for polled events to come to us, we have to actively
3231 * find and complete them.
3232 */
c072481d 3233static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
def596e9
JA
3234{
3235 if (!(ctx->flags & IORING_SETUP_IOPOLL))
3236 return;
3237
3238 mutex_lock(&ctx->uring_lock);
5eef4e87 3239 while (!wq_list_empty(&ctx->iopoll_list)) {
b2edc0a7 3240 /* let it sleep and repeat later if can't complete a request */
5ba3c874 3241 if (io_do_iopoll(ctx, true) == 0)
b2edc0a7 3242 break;
08f5439f
JA
3243 /*
3244 * Ensure we allow local-to-the-cpu processing to take place,
3245 * in this case we need to ensure that we reap all events.
3fcee5a6 3246 * Also let task_work, etc. to progress by releasing the mutex
08f5439f 3247 */
3fcee5a6
PB
3248 if (need_resched()) {
3249 mutex_unlock(&ctx->uring_lock);
3250 cond_resched();
3251 mutex_lock(&ctx->uring_lock);
3252 }
def596e9
JA
3253 }
3254 mutex_unlock(&ctx->uring_lock);
3255}
3256
7668b92a 3257static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
def596e9 3258{
7668b92a 3259 unsigned int nr_events = 0;
e9979b36 3260 int ret = 0;
155bc950 3261 unsigned long check_cq;
500f9fba 3262
f39c8a5b
PB
3263 /*
3264 * Don't enter poll loop if we already have events pending.
3265 * If we do, we can potentially be spinning for commands that
3266 * already triggered a CQE (eg in error).
3267 */
155bc950
DY
3268 check_cq = READ_ONCE(ctx->check_cq);
3269 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
f39c8a5b
PB
3270 __io_cqring_overflow_flush(ctx, false);
3271 if (io_cqring_events(ctx))
d487b43c 3272 return 0;
155bc950
DY
3273
3274 /*
3275 * Similarly do not spin if we have not informed the user of any
3276 * dropped CQE.
3277 */
3278 if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
3279 return -EBADR;
3280
def596e9 3281 do {
500f9fba
JA
3282 /*
3283 * If a submit got punted to a workqueue, we can have the
3284 * application entering polling for a command before it gets
3285 * issued. That app will hold the uring_lock for the duration
3286 * of the poll right here, so we need to take a breather every
3287 * now and then to ensure that the issue has a chance to add
3288 * the poll to the issued list. Otherwise we can spin here
3289 * forever, while the workqueue is stuck trying to acquire the
3290 * very same mutex.
3291 */
5eef4e87 3292 if (wq_list_empty(&ctx->iopoll_list)) {
8f487ef2
PB
3293 u32 tail = ctx->cached_cq_tail;
3294
500f9fba 3295 mutex_unlock(&ctx->uring_lock);
4c6e277c 3296 io_run_task_work();
500f9fba 3297 mutex_lock(&ctx->uring_lock);
def596e9 3298
8f487ef2
PB
3299 /* some requests don't go through iopoll_list */
3300 if (tail != ctx->cached_cq_tail ||
5eef4e87 3301 wq_list_empty(&ctx->iopoll_list))
e9979b36 3302 break;
500f9fba 3303 }
5ba3c874
PB
3304 ret = io_do_iopoll(ctx, !min);
3305 if (ret < 0)
3306 break;
3307 nr_events += ret;
3308 ret = 0;
3309 } while (nr_events < min && !need_resched());
d487b43c 3310
def596e9
JA
3311 return ret;
3312}
3313
491381ce 3314static void kiocb_end_write(struct io_kiocb *req)
2b188cc1 3315{
491381ce
JA
3316 /*
3317 * Tell lockdep we inherited freeze protection from submission
3318 * thread.
3319 */
3320 if (req->flags & REQ_F_ISREG) {
1c98679d 3321 struct super_block *sb = file_inode(req->file)->i_sb;
2b188cc1 3322
1c98679d
PB
3323 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
3324 sb_end_write(sb);
2b188cc1
JA
3325 }
3326}
3327
b63534c4 3328#ifdef CONFIG_BLOCK
dc2a6e9a 3329static bool io_resubmit_prep(struct io_kiocb *req)
b63534c4 3330{
ab454438 3331 struct io_async_rw *rw = req->async_data;
b63534c4 3332
d886e185 3333 if (!req_has_async_data(req))
ab454438 3334 return !io_req_prep_async(req);
538941e2 3335 iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
ab454438 3336 return true;
b63534c4 3337}
b63534c4 3338
3e6a0d3c 3339static bool io_rw_should_reissue(struct io_kiocb *req)
b63534c4 3340{
355afaeb 3341 umode_t mode = file_inode(req->file)->i_mode;
3e6a0d3c 3342 struct io_ring_ctx *ctx = req->ctx;
b63534c4 3343
355afaeb
JA
3344 if (!S_ISBLK(mode) && !S_ISREG(mode))
3345 return false;
3e6a0d3c
JA
3346 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
3347 !(ctx->flags & IORING_SETUP_IOPOLL)))
b63534c4 3348 return false;
7c977a58
JA
3349 /*
3350 * If ref is dying, we might be running poll reap from the exit work.
3351 * Don't attempt to reissue from that path, just let it fail with
3352 * -EAGAIN.
3353 */
3e6a0d3c
JA
3354 if (percpu_ref_is_dying(&ctx->refs))
3355 return false;
ef046888
JA
3356 /*
3357 * Play it safe and assume not safe to re-import and reissue if we're
3358 * not in the original thread group (or in task context).
3359 */
3360 if (!same_thread_group(req->task, current) || !in_task())
3361 return false;
3e6a0d3c
JA
3362 return true;
3363}
e82ad485 3364#else
a1ff1e3f 3365static bool io_resubmit_prep(struct io_kiocb *req)
e82ad485
JA
3366{
3367 return false;
3368}
e82ad485 3369static bool io_rw_should_reissue(struct io_kiocb *req)
3e6a0d3c 3370{
b63534c4
JA
3371 return false;
3372}
3e6a0d3c 3373#endif
b63534c4 3374
8ef12efe 3375static bool __io_complete_rw_common(struct io_kiocb *req, long res)
a1d7c393 3376{
f63cf519 3377 if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
b65c128f 3378 kiocb_end_write(req);
f63cf519
JA
3379 fsnotify_modify(req->file);
3380 } else {
3381 fsnotify_access(req->file);
3382 }
cef216fc 3383 if (unlikely(res != req->cqe.res)) {
9532b99b
PB
3384 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
3385 io_rw_should_reissue(req)) {
3386 req->flags |= REQ_F_REISSUE;
8ef12efe 3387 return true;
9532b99b 3388 }
93d2bcd2 3389 req_set_fail(req);
cef216fc 3390 req->cqe.res = res;
9532b99b 3391 }
8ef12efe
JA
3392 return false;
3393}
3394
cc8e9ba7 3395static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
8ef12efe 3396{
cef216fc 3397 int res = req->cqe.res;
126180b9
PB
3398
3399 if (*locked) {
cc3cec83 3400 io_req_complete_state(req, res, io_put_kbuf(req, 0));
fff4e40e 3401 io_req_add_compl_list(req);
126180b9 3402 } else {
cc3cec83
JA
3403 io_req_complete_post(req, res,
3404 io_put_kbuf(req, IO_URING_F_UNLOCKED));
126180b9 3405 }
8ef12efe
JA
3406}
3407
00f6e68b 3408static void __io_complete_rw(struct io_kiocb *req, long res,
8ef12efe
JA
3409 unsigned int issue_flags)
3410{
3411 if (__io_complete_rw_common(req, res))
3412 return;
cef216fc 3413 __io_req_complete(req, issue_flags, req->cqe.res,
cc3cec83 3414 io_put_kbuf(req, issue_flags));
ba816ad6
JA
3415}
3416
6b19b766 3417static void io_complete_rw(struct kiocb *kiocb, long res)
ba816ad6 3418{
9adbd45d 3419 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
ba816ad6 3420
8ef12efe
JA
3421 if (__io_complete_rw_common(req, res))
3422 return;
cef216fc 3423 req->cqe.res = res;
8ef12efe 3424 req->io_task_work.func = io_req_task_complete;
f28c240e 3425 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
2b188cc1
JA
3426}
3427
6b19b766 3428static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
def596e9 3429{
9adbd45d 3430 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
def596e9 3431
491381ce
JA
3432 if (kiocb->ki_flags & IOCB_WRITE)
3433 kiocb_end_write(req);
cef216fc 3434 if (unlikely(res != req->cqe.res)) {
b66ceaf3
PB
3435 if (res == -EAGAIN && io_rw_should_reissue(req)) {
3436 req->flags |= REQ_F_REISSUE;
3437 return;
9532b99b 3438 }
cef216fc 3439 req->cqe.res = res;
8c130827 3440 }
bbde017a 3441
b3fa03fd
PB
3442 /* order with io_iopoll_complete() checking ->iopoll_completed */
3443 smp_store_release(&req->iopoll_completed, 1);
def596e9
JA
3444}
3445
3446/*
3447 * After the iocb has been issued, it's safe to be found on the poll list.
3448 * Adding the kiocb to the list AFTER submission ensures that we don't
f39c8a5b 3449 * find it from a io_do_iopoll() thread before the issuer is done
def596e9
JA
3450 * accessing the kiocb cookie.
3451 */
9882131c 3452static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
def596e9
JA
3453{
3454 struct io_ring_ctx *ctx = req->ctx;
3b44b371 3455 const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
cb3d8972
PB
3456
3457 /* workqueue context doesn't hold uring_lock, grab it now */
3b44b371 3458 if (unlikely(needs_lock))
cb3d8972 3459 mutex_lock(&ctx->uring_lock);
def596e9
JA
3460
3461 /*
3462 * Track whether we have multiple files in our lists. This will impact
3463 * how we do polling eventually, not spinning if we're on potentially
3464 * different devices.
3465 */
5eef4e87 3466 if (wq_list_empty(&ctx->iopoll_list)) {
915b3dde
HX
3467 ctx->poll_multi_queue = false;
3468 } else if (!ctx->poll_multi_queue) {
def596e9
JA
3469 struct io_kiocb *list_req;
3470
5eef4e87
PB
3471 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
3472 comp_list);
30da1b45 3473 if (list_req->file != req->file)
915b3dde 3474 ctx->poll_multi_queue = true;
def596e9
JA
3475 }
3476
3477 /*
3478 * For fast devices, IO may have already completed. If it has, add
3479 * it to the front so we find it first.
3480 */
65a6543d 3481 if (READ_ONCE(req->iopoll_completed))
5eef4e87 3482 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
def596e9 3483 else
5eef4e87 3484 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
bdcd3eab 3485
3b44b371 3486 if (unlikely(needs_lock)) {
cb3d8972
PB
3487 /*
3488 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
3489 * in sq thread task context or in io worker task context. If
3490 * current task context is sq thread, we don't need to check
3491 * whether should wake up sq thread.
3492 */
3493 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
3494 wq_has_sleeper(&ctx->sq_data->wait))
3495 wake_up(&ctx->sq_data->wait);
3496
3497 mutex_unlock(&ctx->uring_lock);
3498 }
def596e9
JA
3499}
3500
4503b767
JA
3501static bool io_bdev_nowait(struct block_device *bdev)
3502{
9ba0d0c8 3503 return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
4503b767
JA
3504}
3505
2b188cc1
JA
3506/*
3507 * If we tracked the file through the SCM inflight mechanism, we could support
3508 * any file. For now, just ensure that anything potentially problematic is done
3509 * inline.
3510 */
88459b50 3511static bool __io_file_supports_nowait(struct file *file, umode_t mode)
2b188cc1 3512{
4503b767 3513 if (S_ISBLK(mode)) {
4e7b5671
CH
3514 if (IS_ENABLED(CONFIG_BLOCK) &&
3515 io_bdev_nowait(I_BDEV(file->f_mapping->host)))
4503b767
JA
3516 return true;
3517 return false;
3518 }
976517f1 3519 if (S_ISSOCK(mode))
2b188cc1 3520 return true;
4503b767 3521 if (S_ISREG(mode)) {
4e7b5671
CH
3522 if (IS_ENABLED(CONFIG_BLOCK) &&
3523 io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
4503b767
JA
3524 file->f_op != &io_uring_fops)
3525 return true;
3526 return false;
3527 }
2b188cc1 3528
c5b85625
JA
3529 /* any ->read/write should understand O_NONBLOCK */
3530 if (file->f_flags & O_NONBLOCK)
3531 return true;
35645ac3 3532 return file->f_mode & FMODE_NOWAIT;
2b188cc1 3533}
c5b85625 3534
88459b50
PB
3535/*
3536 * If we tracked the file through the SCM inflight mechanism, we could support
3537 * any file. For now, just ensure that anything potentially problematic is done
3538 * inline.
3539 */
3540static unsigned int io_file_get_flags(struct file *file)
3541{
3542 umode_t mode = file_inode(file)->i_mode;
3543 unsigned int res = 0;
af197f50 3544
88459b50
PB
3545 if (S_ISREG(mode))
3546 res |= FFS_ISREG;
3547 if (__io_file_supports_nowait(file, mode))
3548 res |= FFS_NOWAIT;
5e45690a
JA
3549 if (io_file_need_scm(file))
3550 res |= FFS_SCM;
88459b50 3551 return res;
2b188cc1
JA
3552}
3553
35645ac3 3554static inline bool io_file_supports_nowait(struct io_kiocb *req)
7b29f92d 3555{
88459b50 3556 return req->flags & REQ_F_SUPPORT_NOWAIT;
7b29f92d
JA
3557}
3558
b9a6b8f9 3559static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2b188cc1 3560{
9adbd45d 3561 struct kiocb *kiocb = &req->rw.kiocb;
09bb8394
JA
3562 unsigned ioprio;
3563 int ret;
2b188cc1 3564
2b188cc1 3565 kiocb->ki_pos = READ_ONCE(sqe->off);
9adbd45d 3566
fb27274a
PB
3567 ioprio = READ_ONCE(sqe->ioprio);
3568 if (ioprio) {
3569 ret = ioprio_check_cap(ioprio);
3570 if (ret)
3571 return ret;
3572
3573 kiocb->ki_ioprio = ioprio;
3574 } else {
3575 kiocb->ki_ioprio = get_current_ioprio();
eae071c9
PB
3576 }
3577
578c0ee2 3578 req->imu = NULL;
3529d8c2
JA
3579 req->rw.addr = READ_ONCE(sqe->addr);
3580 req->rw.len = READ_ONCE(sqe->len);
584b0180 3581 req->rw.flags = READ_ONCE(sqe->rw_flags);
4e906702 3582 /* used for fixed read/write too - just read unconditionally */
4f4eeba8 3583 req->buf_index = READ_ONCE(sqe->buf_index);
2b188cc1 3584 return 0;
2b188cc1
JA
3585}
3586
3587static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3588{
3589 switch (ret) {
3590 case -EIOCBQUEUED:
3591 break;
3592 case -ERESTARTSYS:
3593 case -ERESTARTNOINTR:
3594 case -ERESTARTNOHAND:
3595 case -ERESTART_RESTARTBLOCK:
3596 /*
3597 * We can't just restart the syscall, since previously
3598 * submitted sqes may already be in progress. Just fail this
3599 * IO with EINTR.
3600 */
3601 ret = -EINTR;
df561f66 3602 fallthrough;
2b188cc1 3603 default:
6b19b766 3604 kiocb->ki_complete(kiocb, ret);
2b188cc1
JA
3605 }
3606}
3607
b4aec400 3608static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
d34e1e5b
DY
3609{
3610 struct kiocb *kiocb = &req->rw.kiocb;
3611
6f83ab22
JA
3612 if (kiocb->ki_pos != -1)
3613 return &kiocb->ki_pos;
3614
3615 if (!(req->file->f_mode & FMODE_STREAM)) {
3616 req->flags |= REQ_F_CUR_POS;
3617 kiocb->ki_pos = req->file->f_pos;
3618 return &kiocb->ki_pos;
d34e1e5b 3619 }
6f83ab22
JA
3620
3621 kiocb->ki_pos = 0;
3622 return NULL;
d34e1e5b
DY
3623}
3624
2ea537ca 3625static void kiocb_done(struct io_kiocb *req, ssize_t ret,
889fca73 3626 unsigned int issue_flags)
ba816ad6 3627{
e8c2bc1f 3628 struct io_async_rw *io = req->async_data;
ba04291e 3629
227c0c96 3630 /* add previously done IO, if any */
d886e185 3631 if (req_has_async_data(req) && io->bytes_done > 0) {
227c0c96 3632 if (ret < 0)
e8c2bc1f 3633 ret = io->bytes_done;
227c0c96 3634 else
e8c2bc1f 3635 ret += io->bytes_done;
227c0c96
JA
3636 }
3637
ba04291e 3638 if (req->flags & REQ_F_CUR_POS)
2ea537ca
PB
3639 req->file->f_pos = req->rw.kiocb.ki_pos;
3640 if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
00f6e68b 3641 __io_complete_rw(req, ret, issue_flags);
ba816ad6 3642 else
2ea537ca 3643 io_rw_done(&req->rw.kiocb, ret);
97284637 3644
b66ceaf3 3645 if (req->flags & REQ_F_REISSUE) {
97284637 3646 req->flags &= ~REQ_F_REISSUE;
b91ef187 3647 if (io_resubmit_prep(req))
773af691 3648 io_req_task_queue_reissue(req);
b91ef187
PB
3649 else
3650 io_req_task_queue_fail(req, ret);
97284637 3651 }
ba816ad6
JA
3652}
3653
eae071c9
PB
3654static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3655 struct io_mapped_ubuf *imu)
edafccee 3656{
9adbd45d 3657 size_t len = req->rw.len;
75769e3f 3658 u64 buf_end, buf_addr = req->rw.addr;
edafccee 3659 size_t offset;
edafccee 3660
75769e3f 3661 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
edafccee
JA
3662 return -EFAULT;
3663 /* not inside the mapped region */
4751f53d 3664 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
edafccee
JA
3665 return -EFAULT;
3666
3667 /*
3668 * May not be a start of buffer, set size appropriately
3669 * and advance us to the beginning.
3670 */
3671 offset = buf_addr - imu->ubuf;
3672 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
bd11b3a3
JA
3673
3674 if (offset) {
3675 /*
3676 * Don't use iov_iter_advance() here, as it's really slow for
3677 * using the latter parts of a big fixed buffer - it iterates
3678 * over each segment manually. We can cheat a bit here, because
3679 * we know that:
3680 *
3681 * 1) it's a BVEC iter, we set it up
3682 * 2) all bvecs are PAGE_SIZE in size, except potentially the
3683 * first and last bvec
3684 *
3685 * So just find our index, and adjust the iterator afterwards.
3686 * If the offset is within the first bvec (or the whole first
3687 * bvec, just use iov_iter_advance(). This makes it easier
3688 * since we can just skip the first segment, which may not
3689 * be PAGE_SIZE aligned.
3690 */
3691 const struct bio_vec *bvec = imu->bvec;
3692
3693 if (offset <= bvec->bv_len) {
3694 iov_iter_advance(iter, offset);
3695 } else {
3696 unsigned long seg_skip;
3697
3698 /* skip first vec */
3699 offset -= bvec->bv_len;
3700 seg_skip = 1 + (offset >> PAGE_SHIFT);
3701
3702 iter->bvec = bvec + seg_skip;
3703 iter->nr_segs -= seg_skip;
99c79f66 3704 iter->count -= bvec->bv_len + offset;
bd11b3a3 3705 iter->iov_offset = offset & ~PAGE_MASK;
bd11b3a3
JA
3706 }
3707 }
3708
847595de 3709 return 0;
edafccee
JA
3710}
3711
5106dd6e
JA
3712static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3713 unsigned int issue_flags)
eae071c9 3714{
eae071c9
PB
3715 struct io_mapped_ubuf *imu = req->imu;
3716 u16 index, buf_index = req->buf_index;
3717
3718 if (likely(!imu)) {
578c0ee2
PB
3719 struct io_ring_ctx *ctx = req->ctx;
3720
eae071c9
PB
3721 if (unlikely(buf_index >= ctx->nr_user_bufs))
3722 return -EFAULT;
5106dd6e 3723 io_req_set_rsrc_node(req, ctx, issue_flags);
eae071c9
PB
3724 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3725 imu = READ_ONCE(ctx->user_bufs[index]);
3726 req->imu = imu;
3727 }
3728 return __io_import_fixed(req, rw, iter, imu);
3729}
3730
9cfc7e94
JA
3731static int io_buffer_add_list(struct io_ring_ctx *ctx,
3732 struct io_buffer_list *bl, unsigned int bgid)
bcda7baa 3733{
dbc7d452 3734 bl->bgid = bgid;
9cfc7e94
JA
3735 if (bgid < BGID_ARRAY)
3736 return 0;
bcda7baa 3737
9cfc7e94 3738 return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
bcda7baa
JA
3739}
3740
149c69b0
JA
3741static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
3742 struct io_buffer_list *bl,
3743 unsigned int issue_flags)
dbc7d452 3744{
149c69b0 3745 struct io_buffer *kbuf;
dbc7d452 3746
149c69b0
JA
3747 if (list_empty(&bl->buf_list))
3748 return ERR_PTR(-ENOBUFS);
3749
3750 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
3751 list_del(&kbuf->list);
3752 if (*len > kbuf->len)
3753 *len = kbuf->len;
3754 req->flags |= REQ_F_BUFFER_SELECTED;
3755 req->kbuf = kbuf;
1dbd023e 3756 req->buf_index = kbuf->bid;
149c69b0
JA
3757 io_ring_submit_unlock(req->ctx, issue_flags);
3758 return u64_to_user_ptr(kbuf->addr);
dbc7d452
JA
3759}
3760
c54d52c2 3761static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
4e906702 3762 unsigned int issue_flags)
bcda7baa 3763{
dbc7d452
JA
3764 struct io_ring_ctx *ctx = req->ctx;
3765 struct io_buffer_list *bl;
bcda7baa 3766
f8929630 3767 io_ring_submit_lock(req->ctx, issue_flags);
bcda7baa 3768
4e906702 3769 bl = io_buffer_get_list(ctx, req->buf_index);
149c69b0 3770 if (unlikely(!bl)) {
c54d52c2 3771 io_ring_submit_unlock(req->ctx, issue_flags);
149c69b0 3772 return ERR_PTR(-ENOBUFS);
bcda7baa
JA
3773 }
3774
149c69b0
JA
3775 /* selection helpers drop the submit lock again, if needed */
3776 return io_provided_buffer_select(req, len, bl, issue_flags);
4d954c25
JA
3777}
3778
3779#ifdef CONFIG_COMPAT
3780static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
51aac424 3781 unsigned int issue_flags)
4d954c25
JA
3782{
3783 struct compat_iovec __user *uiov;
3784 compat_ssize_t clen;
3785 void __user *buf;
e5b00349 3786 size_t len;
4d954c25
JA
3787
3788 uiov = u64_to_user_ptr(req->rw.addr);
3789 if (!access_ok(uiov, sizeof(*uiov)))
3790 return -EFAULT;
3791 if (__get_user(clen, &uiov->iov_len))
3792 return -EFAULT;
3793 if (clen < 0)
3794 return -EINVAL;
3795
3796 len = clen;
4e906702 3797 buf = io_buffer_select(req, &len, issue_flags);
4d954c25
JA
3798 if (IS_ERR(buf))
3799 return PTR_ERR(buf);
b66e65f4 3800 req->rw.addr = (unsigned long) buf;
4d954c25 3801 iov[0].iov_base = buf;
b66e65f4 3802 req->rw.len = iov[0].iov_len = (compat_size_t) len;
4d954c25
JA
3803 return 0;
3804}
3805#endif
3806
3807static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
51aac424 3808 unsigned int issue_flags)
4d954c25
JA
3809{
3810 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3811 void __user *buf;
3812 ssize_t len;
3813
3814 if (copy_from_user(iov, uiov, sizeof(*uiov)))
3815 return -EFAULT;
3816
3817 len = iov[0].iov_len;
3818 if (len < 0)
3819 return -EINVAL;
4e906702 3820 buf = io_buffer_select(req, &len, issue_flags);
4d954c25
JA
3821 if (IS_ERR(buf))
3822 return PTR_ERR(buf);
b66e65f4 3823 req->rw.addr = (unsigned long) buf;
4d954c25 3824 iov[0].iov_base = buf;
b66e65f4 3825 req->rw.len = iov[0].iov_len = len;
4d954c25
JA
3826 return 0;
3827}
3828
3829static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
51aac424 3830 unsigned int issue_flags)
4d954c25 3831{
dddb3e26 3832 if (req->flags & REQ_F_BUFFER_SELECTED) {
b66e65f4
JA
3833 iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
3834 iov[0].iov_len = req->rw.len;
4d954c25 3835 return 0;
dddb3e26 3836 }
dd201662 3837 if (req->rw.len != 1)
4d954c25
JA
3838 return -EINVAL;
3839
3840#ifdef CONFIG_COMPAT
3841 if (req->ctx->compat)
51aac424 3842 return io_compat_import(req, iov, issue_flags);
4d954c25
JA
3843#endif
3844
51aac424 3845 return __io_iov_buffer_select(req, iov, issue_flags);
4d954c25
JA
3846}
3847
b66e65f4
JA
3848static inline bool io_do_buffer_select(struct io_kiocb *req)
3849{
3850 if (!(req->flags & REQ_F_BUFFER_SELECT))
3851 return false;
3852 return !(req->flags & REQ_F_BUFFER_SELECTED);
3853}
3854
caa8fe6e
PB
3855static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3856 struct io_rw_state *s,
3857 unsigned int issue_flags)
2b188cc1 3858{
5e49c973 3859 struct iov_iter *iter = &s->iter;
847595de 3860 u8 opcode = req->opcode;
caa8fe6e 3861 struct iovec *iovec;
d1d681b0
PB
3862 void __user *buf;
3863 size_t sqe_len;
4d954c25 3864 ssize_t ret;
edafccee 3865
f3251183 3866 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
5106dd6e 3867 ret = io_import_fixed(req, rw, iter, issue_flags);
f3251183
PB
3868 if (ret)
3869 return ERR_PTR(ret);
3870 return NULL;
3871 }
2b188cc1 3872
d1d681b0
PB
3873 buf = u64_to_user_ptr(req->rw.addr);
3874 sqe_len = req->rw.len;
9adbd45d 3875
3a6820f2 3876 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
b66e65f4 3877 if (io_do_buffer_select(req)) {
4e906702 3878 buf = io_buffer_select(req, &sqe_len, issue_flags);
867a23ea 3879 if (IS_ERR(buf))
898df244 3880 return ERR_CAST(buf);
b66e65f4 3881 req->rw.addr = (unsigned long) buf;
3f9d6441 3882 req->rw.len = sqe_len;
bcda7baa
JA
3883 }
3884
5e49c973 3885 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
f3251183
PB
3886 if (ret)
3887 return ERR_PTR(ret);
3888 return NULL;
3a6820f2
JA
3889 }
3890
caa8fe6e 3891 iovec = s->fast_iov;
4d954c25 3892 if (req->flags & REQ_F_BUFFER_SELECT) {
caa8fe6e 3893 ret = io_iov_buffer_select(req, iovec, issue_flags);
f3251183
PB
3894 if (ret)
3895 return ERR_PTR(ret);
3896 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3897 return NULL;
4d954c25
JA
3898 }
3899
caa8fe6e 3900 ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
89cd35c5 3901 req->ctx->compat);
caa8fe6e
PB
3902 if (unlikely(ret < 0))
3903 return ERR_PTR(ret);
3904 return iovec;
2b188cc1
JA
3905}
3906
5e49c973
PB
3907static inline int io_import_iovec(int rw, struct io_kiocb *req,
3908 struct iovec **iovec, struct io_rw_state *s,
3909 unsigned int issue_flags)
3910{
caa8fe6e
PB
3911 *iovec = __io_import_iovec(rw, req, s, issue_flags);
3912 if (unlikely(IS_ERR(*iovec)))
3913 return PTR_ERR(*iovec);
5e49c973 3914
5e49c973 3915 iov_iter_save_state(&s->iter, &s->iter_state);
caa8fe6e 3916 return 0;
2b188cc1
JA
3917}
3918
0fef9483
JA
3919static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3920{
5b09e37e 3921 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
0fef9483
JA
3922}
3923
31b51510 3924/*
32960613
JA
3925 * For files that don't have ->read_iter() and ->write_iter(), handle them
3926 * by looping over ->read() or ->write() manually.
31b51510 3927 */
4017eb91 3928static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
32960613 3929{
4017eb91
JA
3930 struct kiocb *kiocb = &req->rw.kiocb;
3931 struct file *file = req->file;
32960613 3932 ssize_t ret = 0;
af9c45ec 3933 loff_t *ppos;
32960613
JA
3934
3935 /*
3936 * Don't support polled IO through this interface, and we can't
3937 * support non-blocking either. For the latter, this just causes
3938 * the kiocb to be handled from an async context.
3939 */
3940 if (kiocb->ki_flags & IOCB_HIPRI)
3941 return -EOPNOTSUPP;
35645ac3
PB
3942 if ((kiocb->ki_flags & IOCB_NOWAIT) &&
3943 !(kiocb->ki_filp->f_flags & O_NONBLOCK))
32960613
JA
3944 return -EAGAIN;
3945
af9c45ec
DY
3946 ppos = io_kiocb_ppos(kiocb);
3947
32960613 3948 while (iov_iter_count(iter)) {
311ae9e1 3949 struct iovec iovec;
32960613
JA
3950 ssize_t nr;
3951
311ae9e1
PB
3952 if (!iov_iter_is_bvec(iter)) {
3953 iovec = iov_iter_iovec(iter);
3954 } else {
4017eb91
JA
3955 iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3956 iovec.iov_len = req->rw.len;
311ae9e1
PB
3957 }
3958
32960613
JA
3959 if (rw == READ) {
3960 nr = file->f_op->read(file, iovec.iov_base,
af9c45ec 3961 iovec.iov_len, ppos);
32960613
JA
3962 } else {
3963 nr = file->f_op->write(file, iovec.iov_base,
af9c45ec 3964 iovec.iov_len, ppos);
32960613
JA
3965 }
3966
3967 if (nr < 0) {
3968 if (!ret)
3969 ret = nr;
3970 break;
3971 }
5e929367 3972 ret += nr;
16c8d2df
JA
3973 if (!iov_iter_is_bvec(iter)) {
3974 iov_iter_advance(iter, nr);
3975 } else {
16c8d2df 3976 req->rw.addr += nr;
5e929367
JA
3977 req->rw.len -= nr;
3978 if (!req->rw.len)
3979 break;
16c8d2df 3980 }
32960613
JA
3981 if (nr != iovec.iov_len)
3982 break;
32960613
JA
3983 }
3984
3985 return ret;
3986}
3987
ff6165b2
JA
3988static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3989 const struct iovec *fast_iov, struct iov_iter *iter)
f67676d1 3990{
e8c2bc1f 3991 struct io_async_rw *rw = req->async_data;
b64e3444 3992
538941e2 3993 memcpy(&rw->s.iter, iter, sizeof(*iter));
afb87658 3994 rw->free_iovec = iovec;
227c0c96 3995 rw->bytes_done = 0;
ff6165b2 3996 /* can only be fixed buffers, no need to do anything */
9c3a205c 3997 if (iov_iter_is_bvec(iter))
ff6165b2 3998 return;
b64e3444 3999 if (!iovec) {
ff6165b2
JA
4000 unsigned iov_off = 0;
4001
538941e2 4002 rw->s.iter.iov = rw->s.fast_iov;
ff6165b2
JA
4003 if (iter->iov != fast_iov) {
4004 iov_off = iter->iov - fast_iov;
538941e2 4005 rw->s.iter.iov += iov_off;
ff6165b2 4006 }
538941e2
PB
4007 if (rw->s.fast_iov != fast_iov)
4008 memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
45097dae 4009 sizeof(struct iovec) * iter->nr_segs);
99bc4c38
PB
4010 } else {
4011 req->flags |= REQ_F_NEED_CLEANUP;
f67676d1
JA
4012 }
4013}
4014
8d4af685 4015static inline bool io_alloc_async_data(struct io_kiocb *req)
3d9932a8 4016{
e8c2bc1f
JA
4017 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
4018 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
d886e185
PB
4019 if (req->async_data) {
4020 req->flags |= REQ_F_ASYNC_DATA;
4021 return false;
4022 }
4023 return true;
3d9932a8
XW
4024}
4025
ff6165b2 4026static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
c88598a9 4027 struct io_rw_state *s, bool force)
b7bb4f7d 4028{
26f0505a 4029 if (!force && !io_op_defs[req->opcode].needs_async_setup)
74566df3 4030 return 0;
d886e185 4031 if (!req_has_async_data(req)) {
cd658695
JA
4032 struct io_async_rw *iorw;
4033
6cb78689 4034 if (io_alloc_async_data(req)) {
6bf985dc 4035 kfree(iovec);
5d204bcf 4036 return -ENOMEM;
6bf985dc 4037 }
b7bb4f7d 4038
c88598a9 4039 io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
cd658695
JA
4040 iorw = req->async_data;
4041 /* we've copied and mapped the iter, ensure state is saved */
538941e2 4042 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
5d204bcf 4043 }
b7bb4f7d 4044 return 0;
f67676d1
JA
4045}
4046
73debe68 4047static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
c3e330a4 4048{
e8c2bc1f 4049 struct io_async_rw *iorw = req->async_data;
5e49c973 4050 struct iovec *iov;
847595de 4051 int ret;
c3e330a4 4052
51aac424 4053 /* submission path, ->uring_lock should already be taken */
3b44b371 4054 ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
c3e330a4
PB
4055 if (unlikely(ret < 0))
4056 return ret;
4057
ab0b196c
PB
4058 iorw->bytes_done = 0;
4059 iorw->free_iovec = iov;
4060 if (iov)
4061 req->flags |= REQ_F_NEED_CLEANUP;
c3e330a4
PB
4062 return 0;
4063}
4064
c1dd91d1 4065/*
ffdc8dab 4066 * This is our waitqueue callback handler, registered through __folio_lock_async()
c1dd91d1
JA
4067 * when we initially tried to do the IO with the iocb armed our waitqueue.
4068 * This gets called when the page is unlocked, and we generally expect that to
4069 * happen when the page IO is completed and the page is now uptodate. This will
4070 * queue a task_work based retry of the operation, attempting to copy the data
4071 * again. If the latter fails because the page was NOT uptodate, then we will
4072 * do a thread based blocking retry of the operation. That's the unexpected
4073 * slow path.
4074 */
bcf5a063
JA
4075static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
4076 int sync, void *arg)
4077{
4078 struct wait_page_queue *wpq;
4079 struct io_kiocb *req = wait->private;
bcf5a063 4080 struct wait_page_key *key = arg;
bcf5a063
JA
4081
4082 wpq = container_of(wait, struct wait_page_queue, wait);
4083
cdc8fcb4
LT
4084 if (!wake_page_match(wpq, key))
4085 return 0;
4086
c8d317aa 4087 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
bcf5a063 4088 list_del_init(&wait->entry);
921b9054 4089 io_req_task_queue(req);
bcf5a063
JA
4090 return 1;
4091}
4092
c1dd91d1
JA
4093/*
4094 * This controls whether a given IO request should be armed for async page
4095 * based retry. If we return false here, the request is handed to the async
4096 * worker threads for retry. If we're doing buffered reads on a regular file,
4097 * we prepare a private wait_page_queue entry and retry the operation. This
4098 * will either succeed because the page is now uptodate and unlocked, or it
4099 * will register a callback when the page is unlocked at IO completion. Through
4100 * that callback, io_uring uses task_work to setup a retry of the operation.
4101 * That retry will attempt the buffered read again. The retry will generally
4102 * succeed, or in rare cases where it fails, we then fall back to using the
4103 * async worker threads for a blocking retry.
4104 */
227c0c96 4105static bool io_rw_should_retry(struct io_kiocb *req)
f67676d1 4106{
e8c2bc1f
JA
4107 struct io_async_rw *rw = req->async_data;
4108 struct wait_page_queue *wait = &rw->wpq;
bcf5a063 4109 struct kiocb *kiocb = &req->rw.kiocb;
f67676d1 4110
bcf5a063
JA
4111 /* never retry for NOWAIT, we just complete with -EAGAIN */
4112 if (req->flags & REQ_F_NOWAIT)
4113 return false;
f67676d1 4114
227c0c96 4115 /* Only for buffered IO */
3b2a4439 4116 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
bcf5a063 4117 return false;
3b2a4439 4118
bcf5a063
JA
4119 /*
4120 * just use poll if we can, and don't attempt if the fs doesn't
4121 * support callback based unlocks
4122 */
4123 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
4124 return false;
f67676d1 4125
3b2a4439
JA
4126 wait->wait.func = io_async_buf_func;
4127 wait->wait.private = req;
4128 wait->wait.flags = 0;
4129 INIT_LIST_HEAD(&wait->wait.entry);
4130 kiocb->ki_flags |= IOCB_WAITQ;
c8d317aa 4131 kiocb->ki_flags &= ~IOCB_NOWAIT;
3b2a4439 4132 kiocb->ki_waitq = wait;
3b2a4439 4133 return true;
bcf5a063
JA
4134}
4135
aeab9506 4136static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
bcf5a063 4137{
607b6fb8 4138 if (likely(req->file->f_op->read_iter))
bcf5a063 4139 return call_read_iter(req->file, &req->rw.kiocb, iter);
2dd2111d 4140 else if (req->file->f_op->read)
4017eb91 4141 return loop_rw_iter(READ, req, iter);
2dd2111d
GH
4142 else
4143 return -EINVAL;
f67676d1
JA
4144}
4145
7db30437
ML
4146static bool need_read_all(struct io_kiocb *req)
4147{
4148 return req->flags & REQ_F_ISREG ||
4149 S_ISBLK(file_inode(req->file)->i_mode);
4150}
4151
584b0180
JA
4152static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
4153{
4154 struct kiocb *kiocb = &req->rw.kiocb;
4155 struct io_ring_ctx *ctx = req->ctx;
4156 struct file *file = req->file;
4157 int ret;
4158
4159 if (unlikely(!file || !(file->f_mode & mode)))
4160 return -EBADF;
4161
4162 if (!io_req_ffs_set(req))
4163 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
4164
4165 kiocb->ki_flags = iocb_flags(file);
4166 ret = kiocb_set_rw_flags(kiocb, req->rw.flags);
4167 if (unlikely(ret))
4168 return ret;
4169
4170 /*
4171 * If the file is marked O_NONBLOCK, still allow retry for it if it
4172 * supports async. Otherwise it's impossible to use O_NONBLOCK files
4173 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
4174 */
4175 if ((kiocb->ki_flags & IOCB_NOWAIT) ||
4176 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
4177 req->flags |= REQ_F_NOWAIT;
4178
4179 if (ctx->flags & IORING_SETUP_IOPOLL) {
4180 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
4181 return -EOPNOTSUPP;
4182
32452a3e 4183 kiocb->private = NULL;
584b0180
JA
4184 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
4185 kiocb->ki_complete = io_complete_rw_iopoll;
4186 req->iopoll_completed = 0;
4187 } else {
4188 if (kiocb->ki_flags & IOCB_HIPRI)
4189 return -EINVAL;
4190 kiocb->ki_complete = io_complete_rw;
4191 }
4192
4193 return 0;
4194}
4195
889fca73 4196static int io_read(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 4197{
607b6fb8 4198 struct io_rw_state __s, *s = &__s;
c88598a9 4199 struct iovec *iovec;
9adbd45d 4200 struct kiocb *kiocb = &req->rw.kiocb;
45d189c6 4201 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
d886e185 4202 struct io_async_rw *rw;
cd658695 4203 ssize_t ret, ret2;
b4aec400 4204 loff_t *ppos;
ff6165b2 4205
607b6fb8
PB
4206 if (!req_has_async_data(req)) {
4207 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
4208 if (unlikely(ret < 0))
4209 return ret;
4210 } else {
2be2eb02
JA
4211 /*
4212 * Safe and required to re-import if we're using provided
4213 * buffers, as we dropped the selected one before retry.
4214 */
4215 if (req->flags & REQ_F_BUFFER_SELECT) {
4216 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
4217 if (unlikely(ret < 0))
4218 return ret;
4219 }
4220
d886e185 4221 rw = req->async_data;
c88598a9 4222 s = &rw->s;
cd658695
JA
4223 /*
4224 * We come here from an earlier attempt, restore our state to
4225 * match in case it doesn't. It's cheap enough that we don't
4226 * need to make this conditional.
4227 */
c88598a9 4228 iov_iter_restore(&s->iter, &s->iter_state);
2846c481 4229 iovec = NULL;
2846c481 4230 }
584b0180 4231 ret = io_rw_init_file(req, FMODE_READ);
323b190b
JA
4232 if (unlikely(ret)) {
4233 kfree(iovec);
584b0180 4234 return ret;
323b190b 4235 }
cef216fc 4236 req->cqe.res = iov_iter_count(&s->iter);
2b188cc1 4237
607b6fb8
PB
4238 if (force_nonblock) {
4239 /* If the file doesn't support async, just async punt */
35645ac3 4240 if (unlikely(!io_file_supports_nowait(req))) {
607b6fb8
PB
4241 ret = io_setup_async_rw(req, iovec, s, true);
4242 return ret ?: -EAGAIN;
4243 }
a88fc400 4244 kiocb->ki_flags |= IOCB_NOWAIT;
607b6fb8
PB
4245 } else {
4246 /* Ensure we clear previously set non-block flag */
4247 kiocb->ki_flags &= ~IOCB_NOWAIT;
6713e7a6 4248 }
9e645e11 4249
b4aec400 4250 ppos = io_kiocb_update_pos(req);
d34e1e5b 4251
cef216fc 4252 ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
5ea5dd45
PB
4253 if (unlikely(ret)) {
4254 kfree(iovec);
4255 return ret;
4256 }
2b188cc1 4257
c88598a9 4258 ret = io_iter_do_read(req, &s->iter);
32960613 4259
230d50d4 4260 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
6ad7f233 4261 req->flags &= ~REQ_F_REISSUE;
9af177ee
JA
4262 /* if we can poll, just do that */
4263 if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
4264 return -EAGAIN;
eefdf30f
JA
4265 /* IOPOLL retry should happen for io-wq threads */
4266 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
f91daf56 4267 goto done;
75c668cd
PB
4268 /* no retry on NONBLOCK nor RWF_NOWAIT */
4269 if (req->flags & REQ_F_NOWAIT)
355afaeb 4270 goto done;
f38c7e3a 4271 ret = 0;
230d50d4
JA
4272 } else if (ret == -EIOCBQUEUED) {
4273 goto out_free;
cef216fc 4274 } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
7db30437 4275 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
7335e3bf 4276 /* read all, failed, already did sync or don't want to retry */
00d23d51 4277 goto done;
227c0c96
JA
4278 }
4279
cd658695
JA
4280 /*
4281 * Don't depend on the iter state matching what was consumed, or being
4282 * untouched in case of error. Restore it and we'll advance it
4283 * manually if we need to.
4284 */
c88598a9 4285 iov_iter_restore(&s->iter, &s->iter_state);
cd658695 4286
c88598a9 4287 ret2 = io_setup_async_rw(req, iovec, s, true);
6bf985dc
PB
4288 if (ret2)
4289 return ret2;
4290
fe1cdd55 4291 iovec = NULL;
e8c2bc1f 4292 rw = req->async_data;
c88598a9 4293 s = &rw->s;
cd658695
JA
4294 /*
4295 * Now use our persistent iterator and state, if we aren't already.
4296 * We've restored and mapped the iter to match.
4297 */
227c0c96 4298
b23df91b 4299 do {
cd658695
JA
4300 /*
4301 * We end up here because of a partial read, either from
4302 * above or inside this loop. Advance the iter by the bytes
4303 * that were consumed.
4304 */
c88598a9
PB
4305 iov_iter_advance(&s->iter, ret);
4306 if (!iov_iter_count(&s->iter))
cd658695 4307 break;
b23df91b 4308 rw->bytes_done += ret;
c88598a9 4309 iov_iter_save_state(&s->iter, &s->iter_state);
cd658695 4310
b23df91b
PB
4311 /* if we can retry, do so with the callbacks armed */
4312 if (!io_rw_should_retry(req)) {
4313 kiocb->ki_flags &= ~IOCB_WAITQ;
4314 return -EAGAIN;
4315 }
4316
4317 /*
4318 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
4319 * we get -EIOCBQUEUED, then we'll get a notification when the
4320 * desired page gets unlocked. We can also get a partial read
4321 * here, and if we do, then just retry at the new offset.
4322 */
c88598a9 4323 ret = io_iter_do_read(req, &s->iter);
b23df91b
PB
4324 if (ret == -EIOCBQUEUED)
4325 return 0;
227c0c96 4326 /* we got some bytes, but not all. retry. */
b5b0ecb7 4327 kiocb->ki_flags &= ~IOCB_WAITQ;
c88598a9 4328 iov_iter_restore(&s->iter, &s->iter_state);
cd658695 4329 } while (ret > 0);
227c0c96 4330done:
2ea537ca 4331 kiocb_done(req, ret, issue_flags);
fe1cdd55
PB
4332out_free:
4333 /* it's faster to check here then delegate to kfree */
4334 if (iovec)
4335 kfree(iovec);
5ea5dd45 4336 return 0;
2b188cc1
JA
4337}
4338
889fca73 4339static int io_write(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 4340{
607b6fb8 4341 struct io_rw_state __s, *s = &__s;
c88598a9 4342 struct iovec *iovec;
9adbd45d 4343 struct kiocb *kiocb = &req->rw.kiocb;
45d189c6 4344 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
cd658695 4345 ssize_t ret, ret2;
b4aec400 4346 loff_t *ppos;
2b188cc1 4347
607b6fb8 4348 if (!req_has_async_data(req)) {
5e49c973
PB
4349 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
4350 if (unlikely(ret < 0))
2846c481 4351 return ret;
607b6fb8
PB
4352 } else {
4353 struct io_async_rw *rw = req->async_data;
4354
4355 s = &rw->s;
4356 iov_iter_restore(&s->iter, &s->iter_state);
2846c481 4357 iovec = NULL;
2846c481 4358 }
584b0180 4359 ret = io_rw_init_file(req, FMODE_WRITE);
323b190b
JA
4360 if (unlikely(ret)) {
4361 kfree(iovec);
584b0180 4362 return ret;
323b190b 4363 }
cef216fc 4364 req->cqe.res = iov_iter_count(&s->iter);
2b188cc1 4365
607b6fb8
PB
4366 if (force_nonblock) {
4367 /* If the file doesn't support async, just async punt */
35645ac3 4368 if (unlikely(!io_file_supports_nowait(req)))
607b6fb8 4369 goto copy_iov;
fd6c2e4c 4370
607b6fb8
PB
4371 /* file path doesn't support NOWAIT for non-direct_IO */
4372 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
4373 (req->flags & REQ_F_ISREG))
4374 goto copy_iov;
31b51510 4375
607b6fb8
PB
4376 kiocb->ki_flags |= IOCB_NOWAIT;
4377 } else {
4378 /* Ensure we clear previously set non-block flag */
4379 kiocb->ki_flags &= ~IOCB_NOWAIT;
4380 }
31b51510 4381
b4aec400 4382 ppos = io_kiocb_update_pos(req);
d34e1e5b 4383
cef216fc 4384 ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
fa15bafb
PB
4385 if (unlikely(ret))
4386 goto out_free;
4ed734b0 4387
fa15bafb
PB
4388 /*
4389 * Open-code file_start_write here to grab freeze protection,
4390 * which will be released by another thread in
4391 * io_complete_rw(). Fool lockdep by telling it the lock got
4392 * released so that it doesn't complain about the held lock when
4393 * we return to userspace.
4394 */
4395 if (req->flags & REQ_F_ISREG) {
8a3c84b6 4396 sb_start_write(file_inode(req->file)->i_sb);
fa15bafb
PB
4397 __sb_writers_release(file_inode(req->file)->i_sb,
4398 SB_FREEZE_WRITE);
4399 }
4400 kiocb->ki_flags |= IOCB_WRITE;
4ed734b0 4401
35645ac3 4402 if (likely(req->file->f_op->write_iter))
c88598a9 4403 ret2 = call_write_iter(req->file, kiocb, &s->iter);
2dd2111d 4404 else if (req->file->f_op->write)
c88598a9 4405 ret2 = loop_rw_iter(WRITE, req, &s->iter);
2dd2111d
GH
4406 else
4407 ret2 = -EINVAL;
4ed734b0 4408
6ad7f233
PB
4409 if (req->flags & REQ_F_REISSUE) {
4410 req->flags &= ~REQ_F_REISSUE;
230d50d4 4411 ret2 = -EAGAIN;
6ad7f233 4412 }
230d50d4 4413
fa15bafb
PB
4414 /*
4415 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
4416 * retry them without IOCB_NOWAIT.
4417 */
4418 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
4419 ret2 = -EAGAIN;
75c668cd
PB
4420 /* no retry on NONBLOCK nor RWF_NOWAIT */
4421 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
355afaeb 4422 goto done;
fa15bafb 4423 if (!force_nonblock || ret2 != -EAGAIN) {
eefdf30f 4424 /* IOPOLL retry should happen for io-wq threads */
b10841c9 4425 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
eefdf30f 4426 goto copy_iov;
355afaeb 4427done:
2ea537ca 4428 kiocb_done(req, ret2, issue_flags);
fa15bafb 4429 } else {
f67676d1 4430copy_iov:
c88598a9
PB
4431 iov_iter_restore(&s->iter, &s->iter_state);
4432 ret = io_setup_async_rw(req, iovec, s, false);
6bf985dc 4433 return ret ?: -EAGAIN;
2b188cc1 4434 }
31b51510 4435out_free:
f261c168 4436 /* it's reportedly faster than delegating the null check to kfree() */
252917c3 4437 if (iovec)
6f2cc166 4438 kfree(iovec);
2b188cc1
JA
4439 return ret;
4440}
4441
80a261fd
JA
4442static int io_renameat_prep(struct io_kiocb *req,
4443 const struct io_uring_sqe *sqe)
4444{
4445 struct io_rename *ren = &req->rename;
4446 const char __user *oldf, *newf;
4447
73911426 4448 if (sqe->buf_index || sqe->splice_fd_in)
ed7eb259 4449 return -EINVAL;
80a261fd
JA
4450 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4451 return -EBADF;
4452
4453 ren->old_dfd = READ_ONCE(sqe->fd);
4454 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4455 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4456 ren->new_dfd = READ_ONCE(sqe->len);
4457 ren->flags = READ_ONCE(sqe->rename_flags);
4458
4459 ren->oldpath = getname(oldf);
4460 if (IS_ERR(ren->oldpath))
4461 return PTR_ERR(ren->oldpath);
4462
4463 ren->newpath = getname(newf);
4464 if (IS_ERR(ren->newpath)) {
4465 putname(ren->oldpath);
4466 return PTR_ERR(ren->newpath);
4467 }
4468
4469 req->flags |= REQ_F_NEED_CLEANUP;
4470 return 0;
4471}
4472
45d189c6 4473static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
80a261fd
JA
4474{
4475 struct io_rename *ren = &req->rename;
4476 int ret;
4477
45d189c6 4478 if (issue_flags & IO_URING_F_NONBLOCK)
80a261fd
JA
4479 return -EAGAIN;
4480
4481 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
4482 ren->newpath, ren->flags);
4483
4484 req->flags &= ~REQ_F_NEED_CLEANUP;
4485 if (ret < 0)
93d2bcd2 4486 req_set_fail(req);
80a261fd
JA
4487 io_req_complete(req, ret);
4488 return 0;
4489}
4490
e9621e2b
SR
4491static inline void __io_xattr_finish(struct io_kiocb *req)
4492{
4493 struct io_xattr *ix = &req->xattr;
4494
4495 if (ix->filename)
4496 putname(ix->filename);
4497
4498 kfree(ix->ctx.kname);
4499 kvfree(ix->ctx.kvalue);
4500}
4501
4502static void io_xattr_finish(struct io_kiocb *req, int ret)
4503{
4504 req->flags &= ~REQ_F_NEED_CLEANUP;
4505
4506 __io_xattr_finish(req);
4507 if (ret < 0)
4508 req_set_fail(req);
4509
4510 io_req_complete(req, ret);
4511}
4512
a56834e0
SR
4513static int __io_getxattr_prep(struct io_kiocb *req,
4514 const struct io_uring_sqe *sqe)
4515{
4516 struct io_xattr *ix = &req->xattr;
4517 const char __user *name;
4518 int ret;
4519
a56834e0
SR
4520 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4521 return -EBADF;
4522
4523 ix->filename = NULL;
4524 ix->ctx.kvalue = NULL;
4525 name = u64_to_user_ptr(READ_ONCE(sqe->addr));
4526 ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4527 ix->ctx.size = READ_ONCE(sqe->len);
4528 ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
4529
4530 if (ix->ctx.flags)
4531 return -EINVAL;
4532
4533 ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
4534 if (!ix->ctx.kname)
4535 return -ENOMEM;
4536
4537 ret = strncpy_from_user(ix->ctx.kname->name, name,
4538 sizeof(ix->ctx.kname->name));
4539 if (!ret || ret == sizeof(ix->ctx.kname->name))
4540 ret = -ERANGE;
4541 if (ret < 0) {
4542 kfree(ix->ctx.kname);
4543 return ret;
4544 }
4545
4546 req->flags |= REQ_F_NEED_CLEANUP;
4547 return 0;
4548}
4549
4550static int io_fgetxattr_prep(struct io_kiocb *req,
4551 const struct io_uring_sqe *sqe)
4552{
4553 return __io_getxattr_prep(req, sqe);
4554}
4555
4556static int io_getxattr_prep(struct io_kiocb *req,
4557 const struct io_uring_sqe *sqe)
4558{
4559 struct io_xattr *ix = &req->xattr;
4560 const char __user *path;
4561 int ret;
4562
4563 ret = __io_getxattr_prep(req, sqe);
4564 if (ret)
4565 return ret;
4566
4567 path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
4568
4569 ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
4570 if (IS_ERR(ix->filename)) {
4571 ret = PTR_ERR(ix->filename);
4572 ix->filename = NULL;
4573 }
4574
4575 return ret;
4576}
4577
4578static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
4579{
4580 struct io_xattr *ix = &req->xattr;
4581 int ret;
4582
4583 if (issue_flags & IO_URING_F_NONBLOCK)
4584 return -EAGAIN;
4585
4586 ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
4587 req->file->f_path.dentry,
4588 &ix->ctx);
4589
4590 io_xattr_finish(req, ret);
4591 return 0;
4592}
4593
4594static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
4595{
4596 struct io_xattr *ix = &req->xattr;
4597 unsigned int lookup_flags = LOOKUP_FOLLOW;
4598 struct path path;
4599 int ret;
4600
4601 if (issue_flags & IO_URING_F_NONBLOCK)
4602 return -EAGAIN;
4603
4604retry:
4605 ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
4606 if (!ret) {
4607 ret = do_getxattr(mnt_user_ns(path.mnt),
4608 path.dentry,
4609 &ix->ctx);
4610
4611 path_put(&path);
4612 if (retry_estale(ret, lookup_flags)) {
4613 lookup_flags |= LOOKUP_REVAL;
4614 goto retry;
4615 }
4616 }
4617
4618 io_xattr_finish(req, ret);
4619 return 0;
4620}
4621
e9621e2b
SR
4622static int __io_setxattr_prep(struct io_kiocb *req,
4623 const struct io_uring_sqe *sqe)
4624{
4625 struct io_xattr *ix = &req->xattr;
4626 const char __user *name;
4627 int ret;
4628
e9621e2b
SR
4629 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4630 return -EBADF;
4631
4632 ix->filename = NULL;
4633 name = u64_to_user_ptr(READ_ONCE(sqe->addr));
4634 ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4635 ix->ctx.kvalue = NULL;
4636 ix->ctx.size = READ_ONCE(sqe->len);
4637 ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
4638
4639 ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
4640 if (!ix->ctx.kname)
4641 return -ENOMEM;
4642
4643 ret = setxattr_copy(name, &ix->ctx);
4644 if (ret) {
4645 kfree(ix->ctx.kname);
4646 return ret;
4647 }
4648
4649 req->flags |= REQ_F_NEED_CLEANUP;
4650 return 0;
4651}
4652
4653static int io_setxattr_prep(struct io_kiocb *req,
4654 const struct io_uring_sqe *sqe)
4655{
4656 struct io_xattr *ix = &req->xattr;
4657 const char __user *path;
4658 int ret;
4659
4660 ret = __io_setxattr_prep(req, sqe);
4661 if (ret)
4662 return ret;
4663
4664 path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
4665
4666 ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
4667 if (IS_ERR(ix->filename)) {
4668 ret = PTR_ERR(ix->filename);
4669 ix->filename = NULL;
4670 }
4671
4672 return ret;
4673}
4674
4675static int io_fsetxattr_prep(struct io_kiocb *req,
4676 const struct io_uring_sqe *sqe)
4677{
4678 return __io_setxattr_prep(req, sqe);
4679}
4680
4681static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
4682 struct path *path)
4683{
4684 struct io_xattr *ix = &req->xattr;
4685 int ret;
4686
4687 ret = mnt_want_write(path->mnt);
4688 if (!ret) {
4689 ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
4690 mnt_drop_write(path->mnt);
4691 }
4692
4693 return ret;
4694}
4695
4696static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
4697{
4698 int ret;
4699
4700 if (issue_flags & IO_URING_F_NONBLOCK)
4701 return -EAGAIN;
4702
4703 ret = __io_setxattr(req, issue_flags, &req->file->f_path);
4704 io_xattr_finish(req, ret);
4705
4706 return 0;
4707}
4708
4709static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
4710{
4711 struct io_xattr *ix = &req->xattr;
4712 unsigned int lookup_flags = LOOKUP_FOLLOW;
4713 struct path path;
4714 int ret;
4715
4716 if (issue_flags & IO_URING_F_NONBLOCK)
4717 return -EAGAIN;
4718
4719retry:
4720 ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
4721 if (!ret) {
4722 ret = __io_setxattr(req, issue_flags, &path);
4723 path_put(&path);
4724 if (retry_estale(ret, lookup_flags)) {
4725 lookup_flags |= LOOKUP_REVAL;
4726 goto retry;
4727 }
4728 }
4729
4730 io_xattr_finish(req, ret);
4731 return 0;
4732}
4733
14a1143b
JA
4734static int io_unlinkat_prep(struct io_kiocb *req,
4735 const struct io_uring_sqe *sqe)
4736{
4737 struct io_unlink *un = &req->unlink;
4738 const char __user *fname;
4739
73911426 4740 if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
22634bc5 4741 return -EINVAL;
14a1143b
JA
4742 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4743 return -EBADF;
4744
4745 un->dfd = READ_ONCE(sqe->fd);
4746
4747 un->flags = READ_ONCE(sqe->unlink_flags);
4748 if (un->flags & ~AT_REMOVEDIR)
4749 return -EINVAL;
4750
4751 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4752 un->filename = getname(fname);
4753 if (IS_ERR(un->filename))
4754 return PTR_ERR(un->filename);
4755
4756 req->flags |= REQ_F_NEED_CLEANUP;
4757 return 0;
4758}
4759
45d189c6 4760static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
14a1143b
JA
4761{
4762 struct io_unlink *un = &req->unlink;
4763 int ret;
4764
45d189c6 4765 if (issue_flags & IO_URING_F_NONBLOCK)
14a1143b
JA
4766 return -EAGAIN;
4767
4768 if (un->flags & AT_REMOVEDIR)
4769 ret = do_rmdir(un->dfd, un->filename);
4770 else
4771 ret = do_unlinkat(un->dfd, un->filename);
4772
4773 req->flags &= ~REQ_F_NEED_CLEANUP;
4774 if (ret < 0)
93d2bcd2 4775 req_set_fail(req);
14a1143b
JA
4776 io_req_complete(req, ret);
4777 return 0;
4778}
4779
e34a02dc
DK
4780static int io_mkdirat_prep(struct io_kiocb *req,
4781 const struct io_uring_sqe *sqe)
4782{
4783 struct io_mkdir *mkd = &req->mkdir;
4784 const char __user *fname;
4785
73911426 4786 if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
e34a02dc
DK
4787 return -EINVAL;
4788 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4789 return -EBADF;
4790
4791 mkd->dfd = READ_ONCE(sqe->fd);
4792 mkd->mode = READ_ONCE(sqe->len);
4793
4794 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4795 mkd->filename = getname(fname);
4796 if (IS_ERR(mkd->filename))
4797 return PTR_ERR(mkd->filename);
4798
4799 req->flags |= REQ_F_NEED_CLEANUP;
4800 return 0;
4801}
4802
04f34081 4803static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
e34a02dc
DK
4804{
4805 struct io_mkdir *mkd = &req->mkdir;
4806 int ret;
4807
4808 if (issue_flags & IO_URING_F_NONBLOCK)
4809 return -EAGAIN;
4810
4811 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
4812
4813 req->flags &= ~REQ_F_NEED_CLEANUP;
4814 if (ret < 0)
4815 req_set_fail(req);
4816 io_req_complete(req, ret);
4817 return 0;
4818}
4819
7a8721f8
DK
4820static int io_symlinkat_prep(struct io_kiocb *req,
4821 const struct io_uring_sqe *sqe)
4822{
4823 struct io_symlink *sl = &req->symlink;
4824 const char __user *oldpath, *newpath;
4825
73911426 4826 if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
7a8721f8
DK
4827 return -EINVAL;
4828 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4829 return -EBADF;
4830
4831 sl->new_dfd = READ_ONCE(sqe->fd);
4832 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
4833 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4834
4835 sl->oldpath = getname(oldpath);
4836 if (IS_ERR(sl->oldpath))
4837 return PTR_ERR(sl->oldpath);
4838
4839 sl->newpath = getname(newpath);
4840 if (IS_ERR(sl->newpath)) {
4841 putname(sl->oldpath);
4842 return PTR_ERR(sl->newpath);
4843 }
4844
4845 req->flags |= REQ_F_NEED_CLEANUP;
4846 return 0;
4847}
4848
04f34081 4849static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
7a8721f8
DK
4850{
4851 struct io_symlink *sl = &req->symlink;
4852 int ret;
4853
4854 if (issue_flags & IO_URING_F_NONBLOCK)
4855 return -EAGAIN;
4856
4857 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
4858
4859 req->flags &= ~REQ_F_NEED_CLEANUP;
4860 if (ret < 0)
4861 req_set_fail(req);
4862 io_req_complete(req, ret);
4863 return 0;
4864}
4865
cf30da90
DK
4866static int io_linkat_prep(struct io_kiocb *req,
4867 const struct io_uring_sqe *sqe)
4868{
4869 struct io_hardlink *lnk = &req->hardlink;
4870 const char __user *oldf, *newf;
4871
73911426 4872 if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
cf30da90
DK
4873 return -EINVAL;
4874 if (unlikely(req->flags & REQ_F_FIXED_FILE))
4875 return -EBADF;
4876
4877 lnk->old_dfd = READ_ONCE(sqe->fd);
4878 lnk->new_dfd = READ_ONCE(sqe->len);
4879 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4880 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4881 lnk->flags = READ_ONCE(sqe->hardlink_flags);
4882
4883 lnk->oldpath = getname(oldf);
4884 if (IS_ERR(lnk->oldpath))
4885 return PTR_ERR(lnk->oldpath);
4886
4887 lnk->newpath = getname(newf);
4888 if (IS_ERR(lnk->newpath)) {
4889 putname(lnk->oldpath);
4890 return PTR_ERR(lnk->newpath);
4891 }
4892
4893 req->flags |= REQ_F_NEED_CLEANUP;
4894 return 0;
4895}
4896
04f34081 4897static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
cf30da90
DK
4898{
4899 struct io_hardlink *lnk = &req->hardlink;
4900 int ret;
4901
4902 if (issue_flags & IO_URING_F_NONBLOCK)
4903 return -EAGAIN;
4904
4905 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
4906 lnk->newpath, lnk->flags);
4907
4908 req->flags &= ~REQ_F_NEED_CLEANUP;
4909 if (ret < 0)
4910 req_set_fail(req);
4911 io_req_complete(req, ret);
4912 return 0;
4913}
4914
ee692a21
JA
4915static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
4916{
4917 req->uring_cmd.task_work_cb(&req->uring_cmd);
4918}
4919
4920void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
4921 void (*task_work_cb)(struct io_uring_cmd *))
4922{
4923 struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
4924
4925 req->uring_cmd.task_work_cb = task_work_cb;
4926 req->io_task_work.func = io_uring_cmd_work;
4927 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
4928}
4929EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
4930
4931/*
4932 * Called by consumers of io_uring_cmd, if they originally returned
4933 * -EIOCBQUEUED upon receiving the command.
4934 */
4935void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
4936{
4937 struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
4938
4939 if (ret < 0)
4940 req_set_fail(req);
4941 if (req->ctx->flags & IORING_SETUP_CQE32)
4942 __io_req_complete32(req, 0, ret, 0, res2, 0);
4943 else
4944 io_req_complete(req, ret);
4945}
4946EXPORT_SYMBOL_GPL(io_uring_cmd_done);
4947
4948static int io_uring_cmd_prep_async(struct io_kiocb *req)
4949{
4950 size_t cmd_size;
4951
4952 cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
4953
4954 memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
4955 return 0;
4956}
4957
4958static int io_uring_cmd_prep(struct io_kiocb *req,
4959 const struct io_uring_sqe *sqe)
4960{
4961 struct io_uring_cmd *ioucmd = &req->uring_cmd;
4962
4963 if (sqe->rw_flags)
4964 return -EINVAL;
4965 ioucmd->cmd = sqe->cmd;
4966 ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
4967 return 0;
4968}
4969
4970static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
4971{
4972 struct io_uring_cmd *ioucmd = &req->uring_cmd;
4973 struct io_ring_ctx *ctx = req->ctx;
4974 struct file *file = req->file;
4975 int ret;
4976
4977 if (!req->file->f_op->uring_cmd)
4978 return -EOPNOTSUPP;
4979
4980 if (ctx->flags & IORING_SETUP_SQE128)
4981 issue_flags |= IO_URING_F_SQE128;
4982 if (ctx->flags & IORING_SETUP_CQE32)
4983 issue_flags |= IO_URING_F_CQE32;
4984 if (ctx->flags & IORING_SETUP_IOPOLL)
4985 issue_flags |= IO_URING_F_IOPOLL;
4986
4987 if (req_has_async_data(req))
4988 ioucmd->cmd = req->async_data;
4989
4990 ret = file->f_op->uring_cmd(ioucmd, issue_flags);
4991 if (ret == -EAGAIN) {
4992 if (!req_has_async_data(req)) {
4993 if (io_alloc_async_data(req))
4994 return -ENOMEM;
4995 io_uring_cmd_prep_async(req);
4996 }
4997 return -EAGAIN;
4998 }
4999
5000 if (ret != -EIOCBQUEUED)
5001 io_uring_cmd_done(ioucmd, ret, 0);
5002 return 0;
5003}
5004
36f4fa68
JA
5005static int io_shutdown_prep(struct io_kiocb *req,
5006 const struct io_uring_sqe *sqe)
5007{
5008#if defined(CONFIG_NET)
73911426 5009 if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
26578cda 5010 sqe->buf_index || sqe->splice_fd_in))
36f4fa68
JA
5011 return -EINVAL;
5012
5013 req->shutdown.how = READ_ONCE(sqe->len);
5014 return 0;
5015#else
5016 return -EOPNOTSUPP;
5017#endif
5018}
5019
45d189c6 5020static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
36f4fa68
JA
5021{
5022#if defined(CONFIG_NET)
5023 struct socket *sock;
5024 int ret;
5025
45d189c6 5026 if (issue_flags & IO_URING_F_NONBLOCK)
36f4fa68
JA
5027 return -EAGAIN;
5028
48aba79b 5029 sock = sock_from_file(req->file);
36f4fa68 5030 if (unlikely(!sock))
48aba79b 5031 return -ENOTSOCK;
36f4fa68
JA
5032
5033 ret = __sys_shutdown_sock(sock, req->shutdown.how);
a146468d 5034 if (ret < 0)
93d2bcd2 5035 req_set_fail(req);
36f4fa68
JA
5036 io_req_complete(req, ret);
5037 return 0;
5038#else
5039 return -EOPNOTSUPP;
5040#endif
5041}
5042
f2a8d5c7
PB
5043static int __io_splice_prep(struct io_kiocb *req,
5044 const struct io_uring_sqe *sqe)
7d67af2c 5045{
fe7e3257 5046 struct io_splice *sp = &req->splice;
7d67af2c 5047 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
7d67af2c 5048
7d67af2c
PB
5049 sp->len = READ_ONCE(sqe->len);
5050 sp->flags = READ_ONCE(sqe->splice_flags);
7d67af2c
PB
5051 if (unlikely(sp->flags & ~valid_flags))
5052 return -EINVAL;
a3e4bc23 5053 sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
7d67af2c
PB
5054 return 0;
5055}
5056
f2a8d5c7
PB
5057static int io_tee_prep(struct io_kiocb *req,
5058 const struct io_uring_sqe *sqe)
5059{
5060 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
5061 return -EINVAL;
5062 return __io_splice_prep(req, sqe);
5063}
5064
45d189c6 5065static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
f2a8d5c7
PB
5066{
5067 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
5068 struct file *out = sp->file_out;
5069 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
a3e4bc23 5070 struct file *in;
f2a8d5c7
PB
5071 long ret = 0;
5072
45d189c6 5073 if (issue_flags & IO_URING_F_NONBLOCK)
f2a8d5c7 5074 return -EAGAIN;
a3e4bc23 5075
5106dd6e 5076 if (sp->flags & SPLICE_F_FD_IN_FIXED)
e9419766 5077 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
5106dd6e
JA
5078 else
5079 in = io_file_get_normal(req, sp->splice_fd_in);
a3e4bc23
JA
5080 if (!in) {
5081 ret = -EBADF;
5082 goto done;
5083 }
5084
f2a8d5c7
PB
5085 if (sp->len)
5086 ret = do_tee(in, out, sp->len, flags);
5087
e1d767f0
PB
5088 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
5089 io_put_file(in);
a3e4bc23 5090done:
f2a8d5c7 5091 if (ret != sp->len)
93d2bcd2 5092 req_set_fail(req);
e1e16097 5093 io_req_complete(req, ret);
f2a8d5c7
PB
5094 return 0;
5095}
5096
5097static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5098{
fe7e3257 5099 struct io_splice *sp = &req->splice;
f2a8d5c7
PB
5100
5101 sp->off_in = READ_ONCE(sqe->splice_off_in);
5102 sp->off_out = READ_ONCE(sqe->off);
5103 return __io_splice_prep(req, sqe);
5104}
5105
45d189c6 5106static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
7d67af2c
PB
5107{
5108 struct io_splice *sp = &req->splice;
7d67af2c
PB
5109 struct file *out = sp->file_out;
5110 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
5111 loff_t *poff_in, *poff_out;
a3e4bc23 5112 struct file *in;
c9687426 5113 long ret = 0;
7d67af2c 5114
45d189c6 5115 if (issue_flags & IO_URING_F_NONBLOCK)
2fb3e822 5116 return -EAGAIN;
7d67af2c 5117
5106dd6e 5118 if (sp->flags & SPLICE_F_FD_IN_FIXED)
e9419766 5119 in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
5106dd6e
JA
5120 else
5121 in = io_file_get_normal(req, sp->splice_fd_in);
a3e4bc23
JA
5122 if (!in) {
5123 ret = -EBADF;
5124 goto done;
5125 }
5126
7d67af2c
PB
5127 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
5128 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
c9687426 5129
948a7749 5130 if (sp->len)
c9687426 5131 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
7d67af2c 5132
e1d767f0
PB
5133 if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
5134 io_put_file(in);
a3e4bc23 5135done:
7d67af2c 5136 if (ret != sp->len)
93d2bcd2 5137 req_set_fail(req);
e1e16097 5138 io_req_complete(req, ret);
7d67af2c
PB
5139 return 0;
5140}
5141
2bb04df7
SR
5142static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5143{
5144 /*
5145 * If the ring is setup with CQE32, relay back addr/addr
5146 */
5147 if (req->ctx->flags & IORING_SETUP_CQE32) {
5148 req->nop.extra1 = READ_ONCE(sqe->addr);
5149 req->nop.extra2 = READ_ONCE(sqe->addr2);
5150 }
5151
5152 return 0;
5153}
5154
2b188cc1
JA
5155/*
5156 * IORING_OP_NOP just posts a completion event, nothing else.
5157 */
889fca73 5158static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 5159{
2bb04df7
SR
5160 if (!(req->ctx->flags & IORING_SETUP_CQE32))
5161 __io_req_complete(req, issue_flags, 0, 0);
5162 else
5163 __io_req_complete32(req, issue_flags, 0, 0, req->nop.extra1,
5164 req->nop.extra2);
2b188cc1
JA
5165 return 0;
5166}
5167
4f57f06c
JA
5168static int io_msg_ring_prep(struct io_kiocb *req,
5169 const struct io_uring_sqe *sqe)
5170{
73911426
JA
5171 if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
5172 sqe->buf_index || sqe->personality))
4f57f06c
JA
5173 return -EINVAL;
5174
4f57f06c
JA
5175 req->msg.user_data = READ_ONCE(sqe->off);
5176 req->msg.len = READ_ONCE(sqe->len);
5177 return 0;
5178}
5179
5180static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
5181{
5182 struct io_ring_ctx *target_ctx;
5183 struct io_msg *msg = &req->msg;
4f57f06c 5184 bool filled;
3f1d52ab 5185 int ret;
4f57f06c 5186
3f1d52ab
JA
5187 ret = -EBADFD;
5188 if (req->file->f_op != &io_uring_fops)
5189 goto done;
4f57f06c 5190
3f1d52ab 5191 ret = -EOVERFLOW;
4f57f06c
JA
5192 target_ctx = req->file->private_data;
5193
5194 spin_lock(&target_ctx->completion_lock);
7ef66d18 5195 filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
4f57f06c
JA
5196 io_commit_cqring(target_ctx);
5197 spin_unlock(&target_ctx->completion_lock);
5198
5199 if (filled) {
5200 io_cqring_ev_posted(target_ctx);
5201 ret = 0;
5202 }
5203
3f1d52ab 5204done:
9666d420
JA
5205 if (ret < 0)
5206 req_set_fail(req);
4f57f06c
JA
5207 __io_req_complete(req, issue_flags, ret, 0);
5208 return 0;
5209}
5210
1155c76a 5211static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
c992fe29 5212{
73911426 5213 if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
c992fe29
CH
5214 return -EINVAL;
5215
8ed8d3c3
JA
5216 req->sync.flags = READ_ONCE(sqe->fsync_flags);
5217 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
5218 return -EINVAL;
5219
5220 req->sync.off = READ_ONCE(sqe->off);
5221 req->sync.len = READ_ONCE(sqe->len);
c992fe29
CH
5222 return 0;
5223}
5224
45d189c6 5225static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 5226{
8ed8d3c3 5227 loff_t end = req->sync.off + req->sync.len;
8ed8d3c3
JA
5228 int ret;
5229
ac45abc0 5230 /* fsync always requires a blocking context */
45d189c6 5231 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
5232 return -EAGAIN;
5233
9adbd45d 5234 ret = vfs_fsync_range(req->file, req->sync.off,
8ed8d3c3
JA
5235 end > 0 ? end : LLONG_MAX,
5236 req->sync.flags & IORING_FSYNC_DATASYNC);
5237 if (ret < 0)
93d2bcd2 5238 req_set_fail(req);
e1e16097 5239 io_req_complete(req, ret);
c992fe29
CH
5240 return 0;
5241}
5242
d63d1b5e
JA
5243static int io_fallocate_prep(struct io_kiocb *req,
5244 const struct io_uring_sqe *sqe)
5245{
73911426 5246 if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
3232dd02 5247 return -EINVAL;
d63d1b5e
JA
5248
5249 req->sync.off = READ_ONCE(sqe->off);
5250 req->sync.len = READ_ONCE(sqe->addr);
5251 req->sync.mode = READ_ONCE(sqe->len);
5252 return 0;
5253}
5254
45d189c6 5255static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
5d17b4a4 5256{
ac45abc0
PB
5257 int ret;
5258
d63d1b5e 5259 /* fallocate always requiring blocking context */
45d189c6 5260 if (issue_flags & IO_URING_F_NONBLOCK)
5d17b4a4 5261 return -EAGAIN;
ac45abc0
PB
5262 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
5263 req->sync.len);
ac45abc0 5264 if (ret < 0)
93d2bcd2 5265 req_set_fail(req);
f63cf519
JA
5266 else
5267 fsnotify_modify(req->file);
e1e16097 5268 io_req_complete(req, ret);
5d17b4a4
JA
5269 return 0;
5270}
5271
ec65fea5 5272static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
b7bb4f7d 5273{
f8748881 5274 const char __user *fname;
15b71abe 5275 int ret;
b7bb4f7d 5276
73911426 5277 if (unlikely(sqe->buf_index))
15b71abe 5278 return -EINVAL;
ec65fea5 5279 if (unlikely(req->flags & REQ_F_FIXED_FILE))
cf3040ca 5280 return -EBADF;
03b1230c 5281
ec65fea5
PB
5282 /* open.how should be already initialised */
5283 if (!(req->open.how.flags & O_PATH) && force_o_largefile())
08a1d26e 5284 req->open.how.flags |= O_LARGEFILE;
3529d8c2 5285
25e72d10
PB
5286 req->open.dfd = READ_ONCE(sqe->fd);
5287 fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
f8748881 5288 req->open.filename = getname(fname);
15b71abe
JA
5289 if (IS_ERR(req->open.filename)) {
5290 ret = PTR_ERR(req->open.filename);
5291 req->open.filename = NULL;
5292 return ret;
5293 }
b9445598
PB
5294
5295 req->open.file_slot = READ_ONCE(sqe->file_index);
5296 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
5297 return -EINVAL;
5298
4022e7af 5299 req->open.nofile = rlimit(RLIMIT_NOFILE);
8fef80bf 5300 req->flags |= REQ_F_NEED_CLEANUP;
15b71abe 5301 return 0;
03b1230c
JA
5302}
5303
ec65fea5
PB
5304static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5305{
d3fddf6d
PB
5306 u64 mode = READ_ONCE(sqe->len);
5307 u64 flags = READ_ONCE(sqe->open_flags);
ec65fea5 5308
ec65fea5
PB
5309 req->open.how = build_open_how(flags, mode);
5310 return __io_openat_prep(req, sqe);
5311}
5312
cebdb986 5313static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
aa1fa28f 5314{
cebdb986 5315 struct open_how __user *how;
cebdb986 5316 size_t len;
0fa03c62
JA
5317 int ret;
5318
cebdb986
JA
5319 how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5320 len = READ_ONCE(sqe->len);
cebdb986
JA
5321 if (len < OPEN_HOW_SIZE_VER0)
5322 return -EINVAL;
3529d8c2 5323
cebdb986
JA
5324 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
5325 len);
5326 if (ret)
5327 return ret;
3529d8c2 5328
ec65fea5 5329 return __io_openat_prep(req, sqe);
cebdb986
JA
5330}
5331
45d189c6 5332static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
15b71abe
JA
5333{
5334 struct open_flags op;
15b71abe 5335 struct file *file;
b9445598
PB
5336 bool resolve_nonblock, nonblock_set;
5337 bool fixed = !!req->open.file_slot;
15b71abe
JA
5338 int ret;
5339
cebdb986 5340 ret = build_open_flags(&req->open.how, &op);
15b71abe
JA
5341 if (ret)
5342 goto err;
3a81fd02
JA
5343 nonblock_set = op.open_flag & O_NONBLOCK;
5344 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
45d189c6 5345 if (issue_flags & IO_URING_F_NONBLOCK) {
3a81fd02
JA
5346 /*
5347 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
5348 * it'll always -EAGAIN
5349 */
5350 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
5351 return -EAGAIN;
5352 op.lookup_flags |= LOOKUP_CACHED;
5353 op.open_flag |= O_NONBLOCK;
5354 }
15b71abe 5355
b9445598
PB
5356 if (!fixed) {
5357 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
5358 if (ret < 0)
5359 goto err;
5360 }
15b71abe
JA
5361
5362 file = do_filp_open(req->open.dfd, req->open.filename, &op);
12dcb58a 5363 if (IS_ERR(file)) {
944d1444 5364 /*
12dcb58a
PB
5365 * We could hang on to this 'fd' on retrying, but seems like
5366 * marginal gain for something that is now known to be a slower
5367 * path. So just put it, and we'll get a new one when we retry.
944d1444 5368 */
b9445598
PB
5369 if (!fixed)
5370 put_unused_fd(ret);
3a81fd02 5371
15b71abe 5372 ret = PTR_ERR(file);
12dcb58a
PB
5373 /* only retry if RESOLVE_CACHED wasn't already set by application */
5374 if (ret == -EAGAIN &&
5375 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
5376 return -EAGAIN;
5377 goto err;
15b71abe 5378 }
12dcb58a
PB
5379
5380 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
5381 file->f_flags &= ~O_NONBLOCK;
5382 fsnotify_open(file);
b9445598
PB
5383
5384 if (!fixed)
5385 fd_install(ret, file);
5386 else
5387 ret = io_install_fixed_file(req, file, issue_flags,
5388 req->open.file_slot - 1);
15b71abe
JA
5389err:
5390 putname(req->open.filename);
8fef80bf 5391 req->flags &= ~REQ_F_NEED_CLEANUP;
15b71abe 5392 if (ret < 0)
93d2bcd2 5393 req_set_fail(req);
0bdf3398 5394 __io_req_complete(req, issue_flags, ret, 0);
15b71abe
JA
5395 return 0;
5396}
5397
45d189c6 5398static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
cebdb986 5399{
e45cff58 5400 return io_openat2(req, issue_flags);
cebdb986
JA
5401}
5402
067524e9
JA
5403static int io_remove_buffers_prep(struct io_kiocb *req,
5404 const struct io_uring_sqe *sqe)
5405{
5406 struct io_provide_buf *p = &req->pbuf;
5407 u64 tmp;
5408
73911426 5409 if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
26578cda 5410 sqe->splice_fd_in)
067524e9
JA
5411 return -EINVAL;
5412
5413 tmp = READ_ONCE(sqe->fd);
5414 if (!tmp || tmp > USHRT_MAX)
5415 return -EINVAL;
5416
5417 memset(p, 0, sizeof(*p));
5418 p->nbufs = tmp;
5419 p->bgid = READ_ONCE(sqe->buf_group);
5420 return 0;
5421}
5422
dbc7d452
JA
5423static int __io_remove_buffers(struct io_ring_ctx *ctx,
5424 struct io_buffer_list *bl, unsigned nbufs)
067524e9
JA
5425{
5426 unsigned i = 0;
5427
5428 /* shouldn't happen */
5429 if (!nbufs)
5430 return 0;
5431
5432 /* the head kbuf is the list itself */
dbc7d452 5433 while (!list_empty(&bl->buf_list)) {
067524e9
JA
5434 struct io_buffer *nxt;
5435
dbc7d452 5436 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
067524e9 5437 list_del(&nxt->list);
067524e9
JA
5438 if (++i == nbufs)
5439 return i;
1d0254e6 5440 cond_resched();
067524e9
JA
5441 }
5442 i++;
067524e9
JA
5443
5444 return i;
5445}
5446
889fca73 5447static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
067524e9
JA
5448{
5449 struct io_provide_buf *p = &req->pbuf;
5450 struct io_ring_ctx *ctx = req->ctx;
dbc7d452 5451 struct io_buffer_list *bl;
067524e9 5452 int ret = 0;
067524e9 5453
f8929630 5454 io_ring_submit_lock(ctx, issue_flags);
067524e9
JA
5455
5456 ret = -ENOENT;
dbc7d452
JA
5457 bl = io_buffer_get_list(ctx, p->bgid);
5458 if (bl)
5459 ret = __io_remove_buffers(ctx, bl, p->nbufs);
067524e9 5460 if (ret < 0)
93d2bcd2 5461 req_set_fail(req);
067524e9 5462
9fb8cb49
PB
5463 /* complete before unlock, IOPOLL may need the lock */
5464 __io_req_complete(req, issue_flags, ret, 0);
f8929630 5465 io_ring_submit_unlock(ctx, issue_flags);
067524e9
JA
5466 return 0;
5467}
5468
ddf0322d
JA
5469static int io_provide_buffers_prep(struct io_kiocb *req,
5470 const struct io_uring_sqe *sqe)
5471{
38134ada 5472 unsigned long size, tmp_check;
ddf0322d
JA
5473 struct io_provide_buf *p = &req->pbuf;
5474 u64 tmp;
5475
73911426 5476 if (sqe->rw_flags || sqe->splice_fd_in)
ddf0322d
JA
5477 return -EINVAL;
5478
5479 tmp = READ_ONCE(sqe->fd);
5480 if (!tmp || tmp > USHRT_MAX)
5481 return -E2BIG;
5482 p->nbufs = tmp;
5483 p->addr = READ_ONCE(sqe->addr);
5484 p->len = READ_ONCE(sqe->len);
5485
38134ada
PB
5486 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
5487 &size))
5488 return -EOVERFLOW;
5489 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
5490 return -EOVERFLOW;
5491
d81269fe
PB
5492 size = (unsigned long)p->len * p->nbufs;
5493 if (!access_ok(u64_to_user_ptr(p->addr), size))
ddf0322d
JA
5494 return -EFAULT;
5495
5496 p->bgid = READ_ONCE(sqe->buf_group);
5497 tmp = READ_ONCE(sqe->off);
5498 if (tmp > USHRT_MAX)
5499 return -E2BIG;
5500 p->bid = tmp;
5501 return 0;
5502}
5503
cc3cec83
JA
5504static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
5505{
5506 struct io_buffer *buf;
5507 struct page *page;
5508 int bufs_in_page;
5509
5510 /*
5511 * Completions that don't happen inline (eg not under uring_lock) will
5512 * add to ->io_buffers_comp. If we don't have any free buffers, check
5513 * the completion list and splice those entries first.
5514 */
5515 if (!list_empty_careful(&ctx->io_buffers_comp)) {
5516 spin_lock(&ctx->completion_lock);
5517 if (!list_empty(&ctx->io_buffers_comp)) {
5518 list_splice_init(&ctx->io_buffers_comp,
5519 &ctx->io_buffers_cache);
5520 spin_unlock(&ctx->completion_lock);
5521 return 0;
5522 }
5523 spin_unlock(&ctx->completion_lock);
5524 }
5525
5526 /*
5527 * No free buffers and no completion entries either. Allocate a new
5528 * page worth of buffer entries and add those to our freelist.
5529 */
5530 page = alloc_page(GFP_KERNEL_ACCOUNT);
5531 if (!page)
5532 return -ENOMEM;
5533
5534 list_add(&page->lru, &ctx->io_buffers_pages);
5535
5536 buf = page_address(page);
5537 bufs_in_page = PAGE_SIZE / sizeof(*buf);
5538 while (bufs_in_page) {
5539 list_add_tail(&buf->list, &ctx->io_buffers_cache);
5540 buf++;
5541 bufs_in_page--;
5542 }
5543
5544 return 0;
5545}
5546
5547static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
dbc7d452 5548 struct io_buffer_list *bl)
ddf0322d
JA
5549{
5550 struct io_buffer *buf;
5551 u64 addr = pbuf->addr;
5552 int i, bid = pbuf->bid;
5553
5554 for (i = 0; i < pbuf->nbufs; i++) {
cc3cec83
JA
5555 if (list_empty(&ctx->io_buffers_cache) &&
5556 io_refill_buffer_cache(ctx))
ddf0322d 5557 break;
cc3cec83
JA
5558 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
5559 list);
dbc7d452 5560 list_move_tail(&buf->list, &bl->buf_list);
ddf0322d 5561 buf->addr = addr;
d1f82808 5562 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
ddf0322d 5563 buf->bid = bid;
b1c62645 5564 buf->bgid = pbuf->bgid;
ddf0322d
JA
5565 addr += pbuf->len;
5566 bid++;
f240762f 5567 cond_resched();
ddf0322d
JA
5568 }
5569
dbc7d452 5570 return i ? 0 : -ENOMEM;
ddf0322d
JA
5571}
5572
9cfc7e94
JA
5573static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
5574{
5575 int i;
5576
5577 ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
5578 GFP_KERNEL);
5579 if (!ctx->io_bl)
5580 return -ENOMEM;
5581
5582 for (i = 0; i < BGID_ARRAY; i++) {
5583 INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
5584 ctx->io_bl[i].bgid = i;
5585 }
5586
5587 return 0;
5588}
5589
889fca73 5590static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
ddf0322d
JA
5591{
5592 struct io_provide_buf *p = &req->pbuf;
5593 struct io_ring_ctx *ctx = req->ctx;
dbc7d452 5594 struct io_buffer_list *bl;
ddf0322d
JA
5595 int ret = 0;
5596
f8929630 5597 io_ring_submit_lock(ctx, issue_flags);
ddf0322d 5598
9cfc7e94
JA
5599 if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
5600 ret = io_init_bl_list(ctx);
5601 if (ret)
5602 goto err;
5603 }
ddf0322d 5604
dbc7d452
JA
5605 bl = io_buffer_get_list(ctx, p->bgid);
5606 if (unlikely(!bl)) {
5607 bl = kmalloc(sizeof(*bl), GFP_KERNEL);
5608 if (!bl) {
5609 ret = -ENOMEM;
5610 goto err;
5611 }
9cfc7e94
JA
5612 ret = io_buffer_add_list(ctx, bl, p->bgid);
5613 if (ret) {
5614 kfree(bl);
5615 goto err;
5616 }
ddf0322d 5617 }
dbc7d452
JA
5618
5619 ret = io_add_buffers(ctx, p, bl);
5620err:
ddf0322d 5621 if (ret < 0)
93d2bcd2 5622 req_set_fail(req);
9fb8cb49
PB
5623 /* complete before unlock, IOPOLL may need the lock */
5624 __io_req_complete(req, issue_flags, ret, 0);
f8929630 5625 io_ring_submit_unlock(ctx, issue_flags);
ddf0322d 5626 return 0;
cebdb986
JA
5627}
5628
3e4827b0
JA
5629static int io_epoll_ctl_prep(struct io_kiocb *req,
5630 const struct io_uring_sqe *sqe)
5631{
5632#if defined(CONFIG_EPOLL)
73911426 5633 if (sqe->buf_index || sqe->splice_fd_in)
3232dd02 5634 return -EINVAL;
3e4827b0
JA
5635
5636 req->epoll.epfd = READ_ONCE(sqe->fd);
5637 req->epoll.op = READ_ONCE(sqe->len);
5638 req->epoll.fd = READ_ONCE(sqe->off);
5639
5640 if (ep_op_has_event(req->epoll.op)) {
5641 struct epoll_event __user *ev;
5642
5643 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
5644 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
5645 return -EFAULT;
5646 }
5647
5648 return 0;
5649#else
5650 return -EOPNOTSUPP;
5651#endif
5652}
5653
889fca73 5654static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
3e4827b0
JA
5655{
5656#if defined(CONFIG_EPOLL)
5657 struct io_epoll *ie = &req->epoll;
5658 int ret;
45d189c6 5659 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3e4827b0
JA
5660
5661 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
5662 if (force_nonblock && ret == -EAGAIN)
5663 return -EAGAIN;
5664
5665 if (ret < 0)
93d2bcd2 5666 req_set_fail(req);
889fca73 5667 __io_req_complete(req, issue_flags, ret, 0);
3e4827b0
JA
5668 return 0;
5669#else
5670 return -EOPNOTSUPP;
5671#endif
5672}
5673
c1ca757b
JA
5674static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5675{
5676#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
73911426 5677 if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
3232dd02 5678 return -EINVAL;
c1ca757b
JA
5679
5680 req->madvise.addr = READ_ONCE(sqe->addr);
5681 req->madvise.len = READ_ONCE(sqe->len);
5682 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
5683 return 0;
5684#else
5685 return -EOPNOTSUPP;
5686#endif
5687}
5688
45d189c6 5689static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
c1ca757b
JA
5690{
5691#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
5692 struct io_madvise *ma = &req->madvise;
5693 int ret;
5694
45d189c6 5695 if (issue_flags & IO_URING_F_NONBLOCK)
c1ca757b
JA
5696 return -EAGAIN;
5697
0726b01e 5698 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
c1ca757b 5699 if (ret < 0)
93d2bcd2 5700 req_set_fail(req);
e1e16097 5701 io_req_complete(req, ret);
c1ca757b
JA
5702 return 0;
5703#else
5704 return -EOPNOTSUPP;
5705#endif
5706}
5707
4840e418
JA
5708static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5709{
73911426 5710 if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
3232dd02 5711 return -EINVAL;
4840e418
JA
5712
5713 req->fadvise.offset = READ_ONCE(sqe->off);
5714 req->fadvise.len = READ_ONCE(sqe->len);
5715 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
5716 return 0;
5717}
5718
45d189c6 5719static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4840e418
JA
5720{
5721 struct io_fadvise *fa = &req->fadvise;
5722 int ret;
5723
45d189c6 5724 if (issue_flags & IO_URING_F_NONBLOCK) {
3e69426d
JA
5725 switch (fa->advice) {
5726 case POSIX_FADV_NORMAL:
5727 case POSIX_FADV_RANDOM:
5728 case POSIX_FADV_SEQUENTIAL:
5729 break;
5730 default:
5731 return -EAGAIN;
5732 }
5733 }
4840e418
JA
5734
5735 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
5736 if (ret < 0)
93d2bcd2 5737 req_set_fail(req);
0bdf3398 5738 __io_req_complete(req, issue_flags, ret, 0);
4840e418
JA
5739 return 0;
5740}
5741
eddc7ef5
JA
5742static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5743{
1b6fe6e0
SR
5744 const char __user *path;
5745
73911426 5746 if (sqe->buf_index || sqe->splice_fd_in)
eddc7ef5 5747 return -EINVAL;
9c280f90 5748 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 5749 return -EBADF;
eddc7ef5 5750
1d9e1288
BM
5751 req->statx.dfd = READ_ONCE(sqe->fd);
5752 req->statx.mask = READ_ONCE(sqe->len);
1b6fe6e0 5753 path = u64_to_user_ptr(READ_ONCE(sqe->addr));
1d9e1288
BM
5754 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5755 req->statx.flags = READ_ONCE(sqe->statx_flags);
eddc7ef5 5756
1b6fe6e0
SR
5757 req->statx.filename = getname_flags(path,
5758 getname_statx_lookup_flags(req->statx.flags),
5759 NULL);
5760
5761 if (IS_ERR(req->statx.filename)) {
5762 int ret = PTR_ERR(req->statx.filename);
5763
5764 req->statx.filename = NULL;
5765 return ret;
5766 }
5767
5768 req->flags |= REQ_F_NEED_CLEANUP;
eddc7ef5
JA
5769 return 0;
5770}
5771
45d189c6 5772static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
eddc7ef5 5773{
1d9e1288 5774 struct io_statx *ctx = &req->statx;
eddc7ef5
JA
5775 int ret;
5776
59d70013 5777 if (issue_flags & IO_URING_F_NONBLOCK)
eddc7ef5
JA
5778 return -EAGAIN;
5779
e62753e4
BM
5780 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
5781 ctx->buffer);
eddc7ef5 5782
eddc7ef5 5783 if (ret < 0)
93d2bcd2 5784 req_set_fail(req);
e1e16097 5785 io_req_complete(req, ret);
eddc7ef5
JA
5786 return 0;
5787}
5788
b5dba59e
JA
5789static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5790{
73911426 5791 if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
b5dba59e 5792 return -EINVAL;
9c280f90 5793 if (req->flags & REQ_F_FIXED_FILE)
cf3040ca 5794 return -EBADF;
b5dba59e
JA
5795
5796 req->close.fd = READ_ONCE(sqe->fd);
7df778be
PB
5797 req->close.file_slot = READ_ONCE(sqe->file_index);
5798 if (req->close.file_slot && req->close.fd)
5799 return -EINVAL;
5800
b5dba59e 5801 return 0;
b5dba59e
JA
5802}
5803
889fca73 5804static int io_close(struct io_kiocb *req, unsigned int issue_flags)
b5dba59e 5805{
9eac1904 5806 struct files_struct *files = current->files;
3af73b28 5807 struct io_close *close = &req->close;
9eac1904 5808 struct fdtable *fdt;
a1fde923
PB
5809 struct file *file = NULL;
5810 int ret = -EBADF;
b5dba59e 5811
7df778be
PB
5812 if (req->close.file_slot) {
5813 ret = io_close_fixed(req, issue_flags);
5814 goto err;
5815 }
5816
9eac1904
JA
5817 spin_lock(&files->file_lock);
5818 fdt = files_fdtable(files);
5819 if (close->fd >= fdt->max_fds) {
5820 spin_unlock(&files->file_lock);
5821 goto err;
5822 }
5823 file = fdt->fd[close->fd];
a1fde923 5824 if (!file || file->f_op == &io_uring_fops) {
9eac1904
JA
5825 spin_unlock(&files->file_lock);
5826 file = NULL;
5827 goto err;
3af73b28 5828 }
b5dba59e
JA
5829
5830 /* if the file has a flush method, be safe and punt to async */
45d189c6 5831 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
9eac1904 5832 spin_unlock(&files->file_lock);
0bf0eefd 5833 return -EAGAIN;
a2100672 5834 }
b5dba59e 5835
9eac1904
JA
5836 ret = __close_fd_get_file(close->fd, &file);
5837 spin_unlock(&files->file_lock);
5838 if (ret < 0) {
5839 if (ret == -ENOENT)
5840 ret = -EBADF;
5841 goto err;
5842 }
5843
3af73b28 5844 /* No ->flush() or already async, safely close from here */
9eac1904
JA
5845 ret = filp_close(file, current->files);
5846err:
3af73b28 5847 if (ret < 0)
93d2bcd2 5848 req_set_fail(req);
9eac1904
JA
5849 if (file)
5850 fput(file);
889fca73 5851 __io_req_complete(req, issue_flags, ret, 0);
1a417f4e 5852 return 0;
b5dba59e
JA
5853}
5854
1155c76a 5855static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5d17b4a4 5856{
73911426 5857 if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
5d17b4a4
JA
5858 return -EINVAL;
5859
8ed8d3c3
JA
5860 req->sync.off = READ_ONCE(sqe->off);
5861 req->sync.len = READ_ONCE(sqe->len);
5862 req->sync.flags = READ_ONCE(sqe->sync_range_flags);
8ed8d3c3
JA
5863 return 0;
5864}
5865
45d189c6 5866static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3 5867{
8ed8d3c3
JA
5868 int ret;
5869
ac45abc0 5870 /* sync_file_range always requires a blocking context */
45d189c6 5871 if (issue_flags & IO_URING_F_NONBLOCK)
ac45abc0
PB
5872 return -EAGAIN;
5873
9adbd45d 5874 ret = sync_file_range(req->file, req->sync.off, req->sync.len,
8ed8d3c3
JA
5875 req->sync.flags);
5876 if (ret < 0)
93d2bcd2 5877 req_set_fail(req);
e1e16097 5878 io_req_complete(req, ret);
5d17b4a4
JA
5879 return 0;
5880}
5881
469956e8 5882#if defined(CONFIG_NET)
4c3c0943
JA
5883static bool io_net_retry(struct socket *sock, int flags)
5884{
5885 if (!(flags & MSG_WAITALL))
5886 return false;
5887 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
5888}
5889
02d27d89
PB
5890static int io_setup_async_msg(struct io_kiocb *req,
5891 struct io_async_msghdr *kmsg)
5892{
e8c2bc1f
JA
5893 struct io_async_msghdr *async_msg = req->async_data;
5894
5895 if (async_msg)
02d27d89 5896 return -EAGAIN;
e8c2bc1f 5897 if (io_alloc_async_data(req)) {
257e84a5 5898 kfree(kmsg->free_iov);
02d27d89
PB
5899 return -ENOMEM;
5900 }
e8c2bc1f 5901 async_msg = req->async_data;
02d27d89 5902 req->flags |= REQ_F_NEED_CLEANUP;
e8c2bc1f 5903 memcpy(async_msg, kmsg, sizeof(*kmsg));
2a780802 5904 async_msg->msg.msg_name = &async_msg->addr;
257e84a5
PB
5905 /* if were using fast_iov, set it to the new one */
5906 if (!async_msg->free_iov)
5907 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
5908
02d27d89
PB
5909 return -EAGAIN;
5910}
5911
2ae523ed
PB
5912static int io_sendmsg_copy_hdr(struct io_kiocb *req,
5913 struct io_async_msghdr *iomsg)
5914{
2ae523ed 5915 iomsg->msg.msg_name = &iomsg->addr;
257e84a5 5916 iomsg->free_iov = iomsg->fast_iov;
2ae523ed 5917 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
257e84a5 5918 req->sr_msg.msg_flags, &iomsg->free_iov);
2ae523ed
PB
5919}
5920
93642ef8
PB
5921static int io_sendmsg_prep_async(struct io_kiocb *req)
5922{
5923 int ret;
5924
93642ef8
PB
5925 ret = io_sendmsg_copy_hdr(req, req->async_data);
5926 if (!ret)
5927 req->flags |= REQ_F_NEED_CLEANUP;
5928 return ret;
5929}
5930
3529d8c2 5931static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
03b1230c 5932{
e47293fd 5933 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 5934
0455d4cc 5935 if (unlikely(sqe->file_index))
d2b6f48b 5936 return -EINVAL;
588faa1e
JA
5937 if (unlikely(sqe->addr2 || sqe->file_index))
5938 return -EINVAL;
d2b6f48b 5939
270a5940 5940 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
fddaface 5941 sr->len = READ_ONCE(sqe->len);
0455d4cc
JA
5942 sr->flags = READ_ONCE(sqe->addr2);
5943 if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
5944 return -EINVAL;
04411806
PB
5945 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5946 if (sr->msg_flags & MSG_DONTWAIT)
5947 req->flags |= REQ_F_NOWAIT;
3529d8c2 5948
d8768362
JA
5949#ifdef CONFIG_COMPAT
5950 if (req->ctx->compat)
5951 sr->msg_flags |= MSG_CMSG_COMPAT;
5952#endif
4c3c0943 5953 sr->done_io = 0;
93642ef8 5954 return 0;
03b1230c
JA
5955}
5956
889fca73 5957static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 5958{
6b754c8b 5959 struct io_async_msghdr iomsg, *kmsg;
4c3c0943 5960 struct io_sr_msg *sr = &req->sr_msg;
0fa03c62 5961 struct socket *sock;
7a7cacba 5962 unsigned flags;
0031275d 5963 int min_ret = 0;
0fa03c62
JA
5964 int ret;
5965
dba4a925 5966 sock = sock_from_file(req->file);
7a7cacba 5967 if (unlikely(!sock))
dba4a925 5968 return -ENOTSOCK;
3529d8c2 5969
d886e185
PB
5970 if (req_has_async_data(req)) {
5971 kmsg = req->async_data;
5972 } else {
7a7cacba
PB
5973 ret = io_sendmsg_copy_hdr(req, &iomsg);
5974 if (ret)
5975 return ret;
5976 kmsg = &iomsg;
0fa03c62 5977 }
0fa03c62 5978
0455d4cc
JA
5979 if (!(req->flags & REQ_F_POLLED) &&
5980 (sr->flags & IORING_RECVSEND_POLL_FIRST))
5981 return io_setup_async_msg(req, kmsg);
5982
0a352aaa 5983 flags = sr->msg_flags;
04411806 5984 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 5985 flags |= MSG_DONTWAIT;
0031275d
SM
5986 if (flags & MSG_WAITALL)
5987 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5988
7a7cacba 5989 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
0fa03c62 5990
7297ce3d
PB
5991 if (ret < min_ret) {
5992 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5993 return io_setup_async_msg(req, kmsg);
5994 if (ret == -ERESTARTSYS)
5995 ret = -EINTR;
4c3c0943
JA
5996 if (ret > 0 && io_net_retry(sock, flags)) {
5997 sr->done_io += ret;
5998 req->flags |= REQ_F_PARTIAL_IO;
5999 return io_setup_async_msg(req, kmsg);
6000 }
7297ce3d
PB
6001 req_set_fail(req);
6002 }
257e84a5
PB
6003 /* fast path, check for non-NULL to avoid function call */
6004 if (kmsg->free_iov)
6005 kfree(kmsg->free_iov);
99bc4c38 6006 req->flags &= ~REQ_F_NEED_CLEANUP;
4c3c0943
JA
6007 if (ret >= 0)
6008 ret += sr->done_io;
6009 else if (sr->done_io)
6010 ret = sr->done_io;
889fca73 6011 __io_req_complete(req, issue_flags, ret, 0);
5d17b4a4 6012 return 0;
03b1230c 6013}
aa1fa28f 6014
889fca73 6015static int io_send(struct io_kiocb *req, unsigned int issue_flags)
fddaface 6016{
7a7cacba
PB
6017 struct io_sr_msg *sr = &req->sr_msg;
6018 struct msghdr msg;
6019 struct iovec iov;
fddaface 6020 struct socket *sock;
7a7cacba 6021 unsigned flags;
0031275d 6022 int min_ret = 0;
fddaface
JA
6023 int ret;
6024
0455d4cc
JA
6025 if (!(req->flags & REQ_F_POLLED) &&
6026 (sr->flags & IORING_RECVSEND_POLL_FIRST))
6027 return -EAGAIN;
6028
dba4a925 6029 sock = sock_from_file(req->file);
7a7cacba 6030 if (unlikely(!sock))
dba4a925 6031 return -ENOTSOCK;
fddaface 6032
7a7cacba
PB
6033 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
6034 if (unlikely(ret))
14db8411 6035 return ret;
fddaface 6036
7a7cacba
PB
6037 msg.msg_name = NULL;
6038 msg.msg_control = NULL;
6039 msg.msg_controllen = 0;
6040 msg.msg_namelen = 0;
fddaface 6041
0a352aaa 6042 flags = sr->msg_flags;
04411806 6043 if (issue_flags & IO_URING_F_NONBLOCK)
7a7cacba 6044 flags |= MSG_DONTWAIT;
0031275d
SM
6045 if (flags & MSG_WAITALL)
6046 min_ret = iov_iter_count(&msg.msg_iter);
6047
7a7cacba
PB
6048 msg.msg_flags = flags;
6049 ret = sock_sendmsg(sock, &msg);
7297ce3d
PB
6050 if (ret < min_ret) {
6051 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
6052 return -EAGAIN;
6053 if (ret == -ERESTARTSYS)
6054 ret = -EINTR;
4c3c0943
JA
6055 if (ret > 0 && io_net_retry(sock, flags)) {
6056 sr->len -= ret;
6057 sr->buf += ret;
6058 sr->done_io += ret;
6059 req->flags |= REQ_F_PARTIAL_IO;
6060 return -EAGAIN;
6061 }
93d2bcd2 6062 req_set_fail(req);
7297ce3d 6063 }
4c3c0943
JA
6064 if (ret >= 0)
6065 ret += sr->done_io;
6066 else if (sr->done_io)
6067 ret = sr->done_io;
889fca73 6068 __io_req_complete(req, issue_flags, ret, 0);
fddaface 6069 return 0;
fddaface
JA
6070}
6071
1400e697
PB
6072static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
6073 struct io_async_msghdr *iomsg)
52de1fe1
JA
6074{
6075 struct io_sr_msg *sr = &req->sr_msg;
6076 struct iovec __user *uiov;
6077 size_t iov_len;
6078 int ret;
6079
1400e697
PB
6080 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
6081 &iomsg->uaddr, &uiov, &iov_len);
52de1fe1
JA
6082 if (ret)
6083 return ret;
6084
6085 if (req->flags & REQ_F_BUFFER_SELECT) {
6086 if (iov_len > 1)
6087 return -EINVAL;
5476dfed 6088 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
52de1fe1 6089 return -EFAULT;
5476dfed 6090 sr->len = iomsg->fast_iov[0].iov_len;
257e84a5 6091 iomsg->free_iov = NULL;
52de1fe1 6092 } else {
257e84a5 6093 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 6094 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
257e84a5 6095 &iomsg->free_iov, &iomsg->msg.msg_iter,
89cd35c5 6096 false);
52de1fe1
JA
6097 if (ret > 0)
6098 ret = 0;
6099 }
6100
6101 return ret;
6102}
6103
6104#ifdef CONFIG_COMPAT
6105static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
1400e697 6106 struct io_async_msghdr *iomsg)
52de1fe1 6107{
52de1fe1
JA
6108 struct io_sr_msg *sr = &req->sr_msg;
6109 struct compat_iovec __user *uiov;
6110 compat_uptr_t ptr;
6111 compat_size_t len;
6112 int ret;
6113
4af3417a
PB
6114 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
6115 &ptr, &len);
52de1fe1
JA
6116 if (ret)
6117 return ret;
6118
6119 uiov = compat_ptr(ptr);
6120 if (req->flags & REQ_F_BUFFER_SELECT) {
6121 compat_ssize_t clen;
6122
6123 if (len > 1)
6124 return -EINVAL;
6125 if (!access_ok(uiov, sizeof(*uiov)))
6126 return -EFAULT;
6127 if (__get_user(clen, &uiov->iov_len))
6128 return -EFAULT;
6129 if (clen < 0)
6130 return -EINVAL;
2d280bc8 6131 sr->len = clen;
257e84a5 6132 iomsg->free_iov = NULL;
52de1fe1 6133 } else {
257e84a5 6134 iomsg->free_iov = iomsg->fast_iov;
89cd35c5 6135 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
257e84a5 6136 UIO_FASTIOV, &iomsg->free_iov,
89cd35c5 6137 &iomsg->msg.msg_iter, true);
52de1fe1
JA
6138 if (ret < 0)
6139 return ret;
6140 }
6141
6142 return 0;
6143}
6144#endif
6145
1400e697
PB
6146static int io_recvmsg_copy_hdr(struct io_kiocb *req,
6147 struct io_async_msghdr *iomsg)
52de1fe1 6148{
1400e697 6149 iomsg->msg.msg_name = &iomsg->addr;
52de1fe1
JA
6150
6151#ifdef CONFIG_COMPAT
6152 if (req->ctx->compat)
1400e697 6153 return __io_compat_recvmsg_copy_hdr(req, iomsg);
fddaface 6154#endif
52de1fe1 6155
1400e697 6156 return __io_recvmsg_copy_hdr(req, iomsg);
52de1fe1
JA
6157}
6158
93642ef8 6159static int io_recvmsg_prep_async(struct io_kiocb *req)
aa1fa28f 6160{
99bc4c38 6161 int ret;
3529d8c2 6162
93642ef8
PB
6163 ret = io_recvmsg_copy_hdr(req, req->async_data);
6164 if (!ret)
6165 req->flags |= REQ_F_NEED_CLEANUP;
6166 return ret;
6167}
6168
6169static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6170{
6171 struct io_sr_msg *sr = &req->sr_msg;
6172
0455d4cc 6173 if (unlikely(sqe->file_index))
d2b6f48b 6174 return -EINVAL;
5a1e99b6
JA
6175 if (unlikely(sqe->addr2 || sqe->file_index))
6176 return -EINVAL;
d2b6f48b 6177
270a5940 6178 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
0b7b21e4 6179 sr->len = READ_ONCE(sqe->len);
0455d4cc
JA
6180 sr->flags = READ_ONCE(sqe->addr2);
6181 if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
6182 return -EINVAL;
04411806
PB
6183 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
6184 if (sr->msg_flags & MSG_DONTWAIT)
6185 req->flags |= REQ_F_NOWAIT;
06b76d44 6186
d8768362
JA
6187#ifdef CONFIG_COMPAT
6188 if (req->ctx->compat)
6189 sr->msg_flags |= MSG_CMSG_COMPAT;
6190#endif
7ba89d2a 6191 sr->done_io = 0;
93642ef8 6192 return 0;
aa1fa28f
JA
6193}
6194
889fca73 6195static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
aa1fa28f 6196{
6b754c8b 6197 struct io_async_msghdr iomsg, *kmsg;
7ba89d2a 6198 struct io_sr_msg *sr = &req->sr_msg;
03b1230c 6199 struct socket *sock;
7a7cacba 6200 unsigned flags;
d1fd1c20 6201 int ret, min_ret = 0;
45d189c6 6202 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
03b1230c 6203
dba4a925 6204 sock = sock_from_file(req->file);
7a7cacba 6205 if (unlikely(!sock))
dba4a925 6206 return -ENOTSOCK;
3529d8c2 6207
d886e185
PB
6208 if (req_has_async_data(req)) {
6209 kmsg = req->async_data;
6210 } else {
7a7cacba
PB
6211 ret = io_recvmsg_copy_hdr(req, &iomsg);
6212 if (ret)
681fda8d 6213 return ret;
7a7cacba
PB
6214 kmsg = &iomsg;
6215 }
03b1230c 6216
0455d4cc
JA
6217 if (!(req->flags & REQ_F_POLLED) &&
6218 (sr->flags & IORING_RECVSEND_POLL_FIRST))
6219 return io_setup_async_msg(req, kmsg);
6220
b66e65f4 6221 if (io_do_buffer_select(req)) {
c54d52c2
JA
6222 void __user *buf;
6223
4e906702 6224 buf = io_buffer_select(req, &sr->len, issue_flags);
c54d52c2
JA
6225 if (IS_ERR(buf))
6226 return PTR_ERR(buf);
6227 kmsg->fast_iov[0].iov_base = buf;
0a352aaa
JA
6228 kmsg->fast_iov[0].iov_len = sr->len;
6229 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
6230 sr->len);
7a7cacba 6231 }
52de1fe1 6232
0a352aaa 6233 flags = sr->msg_flags;
04411806 6234 if (force_nonblock)
7a7cacba 6235 flags |= MSG_DONTWAIT;
0031275d
SM
6236 if (flags & MSG_WAITALL)
6237 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
6238
0a352aaa 6239 ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
7297ce3d
PB
6240 if (ret < min_ret) {
6241 if (ret == -EAGAIN && force_nonblock)
6242 return io_setup_async_msg(req, kmsg);
6243 if (ret == -ERESTARTSYS)
6244 ret = -EINTR;
7ba89d2a
JA
6245 if (ret > 0 && io_net_retry(sock, flags)) {
6246 sr->done_io += ret;
8a3e8ee5 6247 req->flags |= REQ_F_PARTIAL_IO;
7ba89d2a
JA
6248 return io_setup_async_msg(req, kmsg);
6249 }
7297ce3d
PB
6250 req_set_fail(req);
6251 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
6252 req_set_fail(req);
6253 }
03b1230c 6254
257e84a5
PB
6255 /* fast path, check for non-NULL to avoid function call */
6256 if (kmsg->free_iov)
6257 kfree(kmsg->free_iov);
99bc4c38 6258 req->flags &= ~REQ_F_NEED_CLEANUP;
7ba89d2a
JA
6259 if (ret >= 0)
6260 ret += sr->done_io;
6261 else if (sr->done_io)
6262 ret = sr->done_io;
cc3cec83 6263 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
03b1230c 6264 return 0;
0fa03c62 6265}
5d17b4a4 6266
889fca73 6267static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
fddaface 6268{
7a7cacba
PB
6269 struct io_sr_msg *sr = &req->sr_msg;
6270 struct msghdr msg;
fddaface 6271 struct socket *sock;
7a7cacba
PB
6272 struct iovec iov;
6273 unsigned flags;
d1fd1c20 6274 int ret, min_ret = 0;
45d189c6 6275 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
fddaface 6276
0455d4cc
JA
6277 if (!(req->flags & REQ_F_POLLED) &&
6278 (sr->flags & IORING_RECVSEND_POLL_FIRST))
6279 return -EAGAIN;
6280
dba4a925 6281 sock = sock_from_file(req->file);
7a7cacba 6282 if (unlikely(!sock))
dba4a925 6283 return -ENOTSOCK;
fddaface 6284
b66e65f4 6285 if (io_do_buffer_select(req)) {
c54d52c2
JA
6286 void __user *buf;
6287
4e906702 6288 buf = io_buffer_select(req, &sr->len, issue_flags);
c54d52c2
JA
6289 if (IS_ERR(buf))
6290 return PTR_ERR(buf);
6291 sr->buf = buf;
bc02ef33 6292 }
bcda7baa 6293
c54d52c2 6294 ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
14c32eee
PB
6295 if (unlikely(ret))
6296 goto out_free;
fddaface 6297
7a7cacba
PB
6298 msg.msg_name = NULL;
6299 msg.msg_control = NULL;
6300 msg.msg_controllen = 0;
6301 msg.msg_namelen = 0;
6302 msg.msg_iocb = NULL;
6303 msg.msg_flags = 0;
fddaface 6304
0a352aaa 6305 flags = sr->msg_flags;
04411806 6306 if (force_nonblock)
7a7cacba 6307 flags |= MSG_DONTWAIT;
0031275d
SM
6308 if (flags & MSG_WAITALL)
6309 min_ret = iov_iter_count(&msg.msg_iter);
6310
7a7cacba 6311 ret = sock_recvmsg(sock, &msg, flags);
7297ce3d
PB
6312 if (ret < min_ret) {
6313 if (ret == -EAGAIN && force_nonblock)
6314 return -EAGAIN;
6315 if (ret == -ERESTARTSYS)
6316 ret = -EINTR;
7ba89d2a
JA
6317 if (ret > 0 && io_net_retry(sock, flags)) {
6318 sr->len -= ret;
6319 sr->buf += ret;
6320 sr->done_io += ret;
8a3e8ee5 6321 req->flags |= REQ_F_PARTIAL_IO;
7ba89d2a
JA
6322 return -EAGAIN;
6323 }
7297ce3d
PB
6324 req_set_fail(req);
6325 } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
0d7c1153 6326out_free:
93d2bcd2 6327 req_set_fail(req);
7297ce3d 6328 }
cc3cec83 6329
7ba89d2a
JA
6330 if (ret >= 0)
6331 ret += sr->done_io;
6332 else if (sr->done_io)
6333 ret = sr->done_io;
cc3cec83 6334 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
fddaface 6335 return 0;
fddaface
JA
6336}
6337
3529d8c2 6338static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
17f2fe35 6339{
8ed8d3c3
JA
6340 struct io_accept *accept = &req->accept;
6341
73911426 6342 if (sqe->len || sqe->buf_index)
17f2fe35
JA
6343 return -EINVAL;
6344
d55e5f5b
JA
6345 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
6346 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
8ed8d3c3 6347 accept->flags = READ_ONCE(sqe->accept_flags);
09952e3e 6348 accept->nofile = rlimit(RLIMIT_NOFILE);
a7083ad5 6349
aaa4db12 6350 accept->file_slot = READ_ONCE(sqe->file_index);
adf3a9e9 6351 if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
aaa4db12 6352 return -EINVAL;
a7083ad5
PB
6353 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
6354 return -EINVAL;
6355 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
6356 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
8ed8d3c3 6357 return 0;
8ed8d3c3 6358}
17f2fe35 6359
889fca73 6360static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
8ed8d3c3
JA
6361{
6362 struct io_accept *accept = &req->accept;
45d189c6 6363 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ac45abc0 6364 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
aaa4db12 6365 bool fixed = !!accept->file_slot;
a7083ad5
PB
6366 struct file *file;
6367 int ret, fd;
8ed8d3c3 6368
aaa4db12
PB
6369 if (!fixed) {
6370 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
6371 if (unlikely(fd < 0))
6372 return fd;
6373 }
a7083ad5
PB
6374 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
6375 accept->flags);
6376 if (IS_ERR(file)) {
aaa4db12
PB
6377 if (!fixed)
6378 put_unused_fd(fd);
a7083ad5
PB
6379 ret = PTR_ERR(file);
6380 if (ret == -EAGAIN && force_nonblock)
6381 return -EAGAIN;
ac45abc0
PB
6382 if (ret == -ERESTARTSYS)
6383 ret = -EINTR;
93d2bcd2 6384 req_set_fail(req);
aaa4db12 6385 } else if (!fixed) {
a7083ad5
PB
6386 fd_install(fd, file);
6387 ret = fd;
aaa4db12
PB
6388 } else {
6389 ret = io_install_fixed_file(req, file, issue_flags,
6390 accept->file_slot - 1);
ac45abc0 6391 }
889fca73 6392 __io_req_complete(req, issue_flags, ret, 0);
17f2fe35 6393 return 0;
8ed8d3c3
JA
6394}
6395
1374e08e
JA
6396static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6397{
6398 struct io_socket *sock = &req->sock;
6399
ee692a21 6400 if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1374e08e
JA
6401 return -EINVAL;
6402
6403 sock->domain = READ_ONCE(sqe->fd);
6404 sock->type = READ_ONCE(sqe->off);
6405 sock->protocol = READ_ONCE(sqe->len);
6406 sock->file_slot = READ_ONCE(sqe->file_index);
6407 sock->nofile = rlimit(RLIMIT_NOFILE);
6408
6409 sock->flags = sock->type & ~SOCK_TYPE_MASK;
6410 if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
6411 return -EINVAL;
6412 if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
6413 return -EINVAL;
6414 return 0;
6415}
6416
6417static int io_socket(struct io_kiocb *req, unsigned int issue_flags)
6418{
6419 struct io_socket *sock = &req->sock;
6420 bool fixed = !!sock->file_slot;
6421 struct file *file;
6422 int ret, fd;
6423
6424 if (!fixed) {
6425 fd = __get_unused_fd_flags(sock->flags, sock->nofile);
6426 if (unlikely(fd < 0))
6427 return fd;
6428 }
6429 file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
6430 if (IS_ERR(file)) {
6431 if (!fixed)
6432 put_unused_fd(fd);
6433 ret = PTR_ERR(file);
6434 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
6435 return -EAGAIN;
6436 if (ret == -ERESTARTSYS)
6437 ret = -EINTR;
6438 req_set_fail(req);
6439 } else if (!fixed) {
6440 fd_install(fd, file);
6441 ret = fd;
6442 } else {
6443 ret = io_install_fixed_file(req, file, issue_flags,
6444 sock->file_slot - 1);
6445 }
6446 __io_req_complete(req, issue_flags, ret, 0);
6447 return 0;
6448}
6449
93642ef8
PB
6450static int io_connect_prep_async(struct io_kiocb *req)
6451{
6452 struct io_async_connect *io = req->async_data;
6453 struct io_connect *conn = &req->connect;
6454
6455 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
6456}
6457
3529d8c2 6458static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f499a021 6459{
3529d8c2 6460 struct io_connect *conn = &req->connect;
f499a021 6461
73911426 6462 if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
3fbb51c1
JA
6463 return -EINVAL;
6464
3529d8c2
JA
6465 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
6466 conn->addr_len = READ_ONCE(sqe->addr2);
93642ef8 6467 return 0;
f499a021
JA
6468}
6469
889fca73 6470static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
f8e85cf2 6471{
e8c2bc1f 6472 struct io_async_connect __io, *io;
f8e85cf2 6473 unsigned file_flags;
3fbb51c1 6474 int ret;
45d189c6 6475 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
f8e85cf2 6476
d886e185 6477 if (req_has_async_data(req)) {
e8c2bc1f 6478 io = req->async_data;
f499a021 6479 } else {
3529d8c2
JA
6480 ret = move_addr_to_kernel(req->connect.addr,
6481 req->connect.addr_len,
e8c2bc1f 6482 &__io.address);
f499a021
JA
6483 if (ret)
6484 goto out;
6485 io = &__io;
6486 }
6487
3fbb51c1
JA
6488 file_flags = force_nonblock ? O_NONBLOCK : 0;
6489
e8c2bc1f 6490 ret = __sys_connect_file(req->file, &io->address,
3fbb51c1 6491 req->connect.addr_len, file_flags);
87f80d62 6492 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
d886e185 6493 if (req_has_async_data(req))
b7bb4f7d 6494 return -EAGAIN;
e8c2bc1f 6495 if (io_alloc_async_data(req)) {
f499a021
JA
6496 ret = -ENOMEM;
6497 goto out;
6498 }
e8c2bc1f 6499 memcpy(req->async_data, &__io, sizeof(__io));
f8e85cf2 6500 return -EAGAIN;
f499a021 6501 }
f8e85cf2
JA
6502 if (ret == -ERESTARTSYS)
6503 ret = -EINTR;
f499a021 6504out:
4e88d6e7 6505 if (ret < 0)
93d2bcd2 6506 req_set_fail(req);
889fca73 6507 __io_req_complete(req, issue_flags, ret, 0);
f8e85cf2 6508 return 0;
469956e8
Y
6509}
6510#else /* !CONFIG_NET */
99a10081
JA
6511#define IO_NETOP_FN(op) \
6512static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
6513{ \
6514 return -EOPNOTSUPP; \
6515}
6516
6517#define IO_NETOP_PREP(op) \
6518IO_NETOP_FN(op) \
6519static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
6520{ \
6521 return -EOPNOTSUPP; \
6522} \
6523
6524#define IO_NETOP_PREP_ASYNC(op) \
6525IO_NETOP_PREP(op) \
6526static int io_##op##_prep_async(struct io_kiocb *req) \
6527{ \
6528 return -EOPNOTSUPP; \
6529}
6530
6531IO_NETOP_PREP_ASYNC(sendmsg);
6532IO_NETOP_PREP_ASYNC(recvmsg);
6533IO_NETOP_PREP_ASYNC(connect);
6534IO_NETOP_PREP(accept);
1374e08e 6535IO_NETOP_PREP(socket);
99a10081
JA
6536IO_NETOP_FN(send);
6537IO_NETOP_FN(recv);
469956e8 6538#endif /* CONFIG_NET */
f8e85cf2 6539
d7718a9d
JA
6540struct io_poll_table {
6541 struct poll_table_struct pt;
6542 struct io_kiocb *req;
68b11e8b 6543 int nr_entries;
d7718a9d
JA
6544 int error;
6545};
ce593a6c 6546
aa43477b 6547#define IO_POLL_CANCEL_FLAG BIT(31)
e2c0cb7c 6548#define IO_POLL_REF_MASK GENMASK(30, 0)
6d816e08 6549
aa43477b
PB
6550/*
6551 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
6552 * bump it and acquire ownership. It's disallowed to modify requests while not
6553 * owning it, that prevents from races for enqueueing task_work's and b/w
6554 * arming poll and wakeups.
6555 */
6556static inline bool io_poll_get_ownership(struct io_kiocb *req)
6557{
6558 return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
d7718a9d
JA
6559}
6560
aa43477b 6561static void io_poll_mark_cancelled(struct io_kiocb *req)
74ce6ce4 6562{
aa43477b 6563 atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
74ce6ce4
JA
6564}
6565
d4e7cd36 6566static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
18bceab1 6567{
e8c2bc1f 6568 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
d4e7cd36 6569 if (req->opcode == IORING_OP_POLL_ADD)
e8c2bc1f 6570 return req->async_data;
d4e7cd36
JA
6571 return req->apoll->double_poll;
6572}
6573
6574static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
6575{
6576 if (req->opcode == IORING_OP_POLL_ADD)
6577 return &req->poll;
6578 return &req->apoll->poll;
6579}
6580
5641897a 6581static void io_poll_req_insert(struct io_kiocb *req)
d4e7cd36 6582{
5641897a
PB
6583 struct io_ring_ctx *ctx = req->ctx;
6584 struct hlist_head *list;
18bceab1 6585
cef216fc 6586 list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)];
5641897a 6587 hlist_add_head(&req->hash_node, list);
18bceab1
JA
6588}
6589
5641897a
PB
6590static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
6591 wait_queue_func_t wake_func)
18bceab1 6592{
5641897a 6593 poll->head = NULL;
5641897a
PB
6594#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
6595 /* mask in events that we always want/need */
6596 poll->events = events | IO_POLL_UNMASK;
6597 INIT_LIST_HEAD(&poll->wait.entry);
6598 init_waitqueue_func_entry(&poll->wait, wake_func);
18bceab1
JA
6599}
6600
aa43477b 6601static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
18bceab1 6602{
791f3465 6603 struct wait_queue_head *head = smp_load_acquire(&poll->head);
18bceab1 6604
791f3465
PB
6605 if (head) {
6606 spin_lock_irq(&head->lock);
6607 list_del_init(&poll->wait.entry);
6608 poll->head = NULL;
6609 spin_unlock_irq(&head->lock);
6610 }
aa43477b 6611}
18bceab1 6612
aa43477b
PB
6613static void io_poll_remove_entries(struct io_kiocb *req)
6614{
91eac1c6
JA
6615 /*
6616 * Nothing to do if neither of those flags are set. Avoid dipping
6617 * into the poll/apoll/double cachelines if we can.
6618 */
6619 if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
6620 return;
18bceab1 6621
791f3465
PB
6622 /*
6623 * While we hold the waitqueue lock and the waitqueue is nonempty,
6624 * wake_up_pollfree() will wait for us. However, taking the waitqueue
6625 * lock in the first place can race with the waitqueue being freed.
6626 *
6627 * We solve this as eventpoll does: by taking advantage of the fact that
6628 * all users of wake_up_pollfree() will RCU-delay the actual free. If
6629 * we enter rcu_read_lock() and see that the pointer to the queue is
6630 * non-NULL, we can then lock it without the memory being freed out from
6631 * under us.
6632 *
6633 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
6634 * case the caller deletes the entry from the queue, leaving it empty.
6635 * In that case, only RCU prevents the queue memory from being freed.
6636 */
6637 rcu_read_lock();
91eac1c6
JA
6638 if (req->flags & REQ_F_SINGLE_POLL)
6639 io_poll_remove_entry(io_poll_get_single(req));
6640 if (req->flags & REQ_F_DOUBLE_POLL)
6641 io_poll_remove_entry(io_poll_get_double(req));
791f3465 6642 rcu_read_unlock();
18bceab1
JA
6643}
6644
aa43477b
PB
6645/*
6646 * All poll tw should go through this. Checks for poll events, manages
6647 * references, does rewait, etc.
6648 *
6649 * Returns a negative error on failure. >0 when no action require, which is
6650 * either spurious wakeup or multishot CQE is served. 0 when it's done with
cef216fc 6651 * the request, then the mask is stored in req->cqe.res.
aa43477b 6652 */
5106dd6e 6653static int io_poll_check_events(struct io_kiocb *req, bool locked)
18bceab1 6654{
74ce6ce4 6655 struct io_ring_ctx *ctx = req->ctx;
aa43477b 6656 int v;
18bceab1 6657
316319e8 6658 /* req->task == current here, checking PF_EXITING is safe */
e09ee510 6659 if (unlikely(req->task->flags & PF_EXITING))
f2219057 6660 return -ECANCELED;
18bceab1 6661
aa43477b
PB
6662 do {
6663 v = atomic_read(&req->poll_refs);
74ce6ce4 6664
aa43477b
PB
6665 /* tw handler should be the owner, and so have some references */
6666 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
6667 return 0;
6668 if (v & IO_POLL_CANCEL_FLAG)
6669 return -ECANCELED;
8706e04e 6670
cef216fc 6671 if (!req->cqe.res) {
2804ecd8 6672 struct poll_table_struct pt = { ._key = req->apoll_events };
cce64ef0 6673 unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED;
18bceab1 6674
cce64ef0 6675 if (unlikely(!io_assign_file(req, flags)))
7179c3ce 6676 return -EBADF;
cef216fc 6677 req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
c8b5e260 6678 }
74ce6ce4 6679
aa43477b 6680 /* multishot, just fill an CQE and proceed */
cef216fc
PB
6681 if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) {
6682 __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events);
aa43477b 6683 bool filled;
18bceab1 6684
aa43477b 6685 spin_lock(&ctx->completion_lock);
cef216fc 6686 filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask,
aa43477b
PB
6687 IORING_CQE_F_MORE);
6688 io_commit_cqring(ctx);
6689 spin_unlock(&ctx->completion_lock);
6690 if (unlikely(!filled))
6691 return -ECANCELED;
6692 io_cqring_ev_posted(ctx);
cef216fc 6693 } else if (req->cqe.res) {
aa43477b
PB
6694 return 0;
6695 }
18bceab1 6696
aa43477b
PB
6697 /*
6698 * Release all references, retry if someone tried to restart
6699 * task_work while we were executing it.
6700 */
6701 } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
18bceab1 6702
18bceab1
JA
6703 return 1;
6704}
6705
aa43477b 6706static void io_poll_task_func(struct io_kiocb *req, bool *locked)
18bceab1 6707{
18bceab1 6708 struct io_ring_ctx *ctx = req->ctx;
aa43477b 6709 int ret;
18bceab1 6710
5106dd6e 6711 ret = io_poll_check_events(req, *locked);
aa43477b
PB
6712 if (ret > 0)
6713 return;
6714
6715 if (!ret) {
cef216fc 6716 req->cqe.res = mangle_poll(req->cqe.res & req->poll.events);
e27414be 6717 } else {
cef216fc 6718 req->cqe.res = ret;
aa43477b 6719 req_set_fail(req);
a62682f9 6720 }
aa43477b
PB
6721
6722 io_poll_remove_entries(req);
6723 spin_lock(&ctx->completion_lock);
6724 hash_del(&req->hash_node);
cef216fc 6725 __io_req_complete_post(req, req->cqe.res, 0);
aa43477b
PB
6726 io_commit_cqring(ctx);
6727 spin_unlock(&ctx->completion_lock);
6728 io_cqring_ev_posted(ctx);
18bceab1
JA
6729}
6730
aa43477b 6731static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
18bceab1
JA
6732{
6733 struct io_ring_ctx *ctx = req->ctx;
aa43477b 6734 int ret;
18bceab1 6735
5106dd6e 6736 ret = io_poll_check_events(req, *locked);
aa43477b
PB
6737 if (ret > 0)
6738 return;
18bceab1 6739
aa43477b
PB
6740 io_poll_remove_entries(req);
6741 spin_lock(&ctx->completion_lock);
6742 hash_del(&req->hash_node);
6743 spin_unlock(&ctx->completion_lock);
18bceab1 6744
aa43477b
PB
6745 if (!ret)
6746 io_req_task_submit(req, locked);
6747 else
6748 io_req_complete_failed(req, ret);
18bceab1
JA
6749}
6750
81459350 6751static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
aa43477b 6752{
cef216fc 6753 req->cqe.res = mask;
81459350
JA
6754 /*
6755 * This is useful for poll that is armed on behalf of another
6756 * request, and where the wakeup path could be on a different
6757 * CPU. We want to avoid pulling in req->apoll->events for that
6758 * case.
6759 */
2804ecd8 6760 req->apoll_events = events;
aa43477b
PB
6761 if (req->opcode == IORING_OP_POLL_ADD)
6762 req->io_task_work.func = io_poll_task_func;
6763 else
6764 req->io_task_work.func = io_apoll_task_func;
6765
cef216fc 6766 trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
aa43477b
PB
6767 io_req_task_work_add(req, false);
6768}
6769
81459350 6770static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
aa43477b
PB
6771{
6772 if (io_poll_get_ownership(req))
81459350 6773 __io_poll_execute(req, res, events);
aa43477b
PB
6774}
6775
6776static void io_poll_cancel_req(struct io_kiocb *req)
6777{
6778 io_poll_mark_cancelled(req);
6779 /* kick tw, which should complete the request */
81459350 6780 io_poll_execute(req, 0, 0);
aa43477b
PB
6781}
6782
d89a4fac
JA
6783#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
6784#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
6785
aa43477b
PB
6786static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
6787 void *key)
18bceab1 6788{
d89a4fac 6789 struct io_kiocb *req = wqe_to_req(wait);
aa43477b
PB
6790 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
6791 wait);
18bceab1
JA
6792 __poll_t mask = key_to_poll(key);
6793
791f3465
PB
6794 if (unlikely(mask & POLLFREE)) {
6795 io_poll_mark_cancelled(req);
6796 /* we have to kick tw in case it's not already */
81459350 6797 io_poll_execute(req, 0, poll->events);
791f3465
PB
6798
6799 /*
6800 * If the waitqueue is being freed early but someone is already
6801 * holds ownership over it, we have to tear down the request as
6802 * best we can. That means immediately removing the request from
6803 * its waitqueue and preventing all further accesses to the
6804 * waitqueue via the request.
6805 */
6806 list_del_init(&poll->wait.entry);
6807
6808 /*
6809 * Careful: this *must* be the last step, since as soon
6810 * as req->head is NULL'ed out, the request can be
6811 * completed and freed, since aio_poll_complete_work()
6812 * will no longer need to take the waitqueue lock.
6813 */
6814 smp_store_release(&poll->head, NULL);
6815 return 1;
6816 }
6817
aa43477b 6818 /* for instances that support it check for an event match first */
18bceab1
JA
6819 if (mask && !(mask & poll->events))
6820 return 0;
6821
eb0089d6
PB
6822 if (io_poll_get_ownership(req)) {
6823 /* optional, saves extra locking for removal in tw handler */
6824 if (mask && poll->events & EPOLLONESHOT) {
6825 list_del_init(&poll->wait.entry);
6826 poll->head = NULL;
d89a4fac
JA
6827 if (wqe_is_double(wait))
6828 req->flags &= ~REQ_F_DOUBLE_POLL;
6829 else
6830 req->flags &= ~REQ_F_SINGLE_POLL;
eb0089d6 6831 }
81459350 6832 __io_poll_execute(req, mask, poll->events);
eb0089d6 6833 }
18bceab1 6834 return 1;
18bceab1
JA
6835}
6836
6837static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
807abcb0
JA
6838 struct wait_queue_head *head,
6839 struct io_poll_iocb **poll_ptr)
18bceab1
JA
6840{
6841 struct io_kiocb *req = pt->req;
d89a4fac 6842 unsigned long wqe_private = (unsigned long) req;
18bceab1
JA
6843
6844 /*
68b11e8b
PB
6845 * The file being polled uses multiple waitqueues for poll handling
6846 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
6847 * if this happens.
18bceab1 6848 */
68b11e8b 6849 if (unlikely(pt->nr_entries)) {
aa43477b 6850 struct io_poll_iocb *first = poll;
58852d4d 6851
23a65db8 6852 /* double add on the same waitqueue head, ignore */
aa43477b 6853 if (first->head == head)
23a65db8 6854 return;
18bceab1 6855 /* already have a 2nd entry, fail a third attempt */
807abcb0 6856 if (*poll_ptr) {
23a65db8
PB
6857 if ((*poll_ptr)->head == head)
6858 return;
18bceab1
JA
6859 pt->error = -EINVAL;
6860 return;
6861 }
aa43477b 6862
18bceab1
JA
6863 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
6864 if (!poll) {
6865 pt->error = -ENOMEM;
6866 return;
6867 }
d89a4fac
JA
6868 /* mark as double wq entry */
6869 wqe_private |= 1;
91eac1c6 6870 req->flags |= REQ_F_DOUBLE_POLL;
aa43477b 6871 io_init_poll_iocb(poll, first->events, first->wait.func);
807abcb0 6872 *poll_ptr = poll;
d886e185
PB
6873 if (req->opcode == IORING_OP_POLL_ADD)
6874 req->flags |= REQ_F_ASYNC_DATA;
18bceab1
JA
6875 }
6876
91eac1c6 6877 req->flags |= REQ_F_SINGLE_POLL;
68b11e8b 6878 pt->nr_entries++;
18bceab1 6879 poll->head = head;
d89a4fac 6880 poll->wait.private = (void *) wqe_private;
a31eb4a2
JX
6881
6882 if (poll->events & EPOLLEXCLUSIVE)
6883 add_wait_queue_exclusive(head, &poll->wait);
6884 else
6885 add_wait_queue(head, &poll->wait);
18bceab1
JA
6886}
6887
aa43477b 6888static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
18bceab1
JA
6889 struct poll_table_struct *p)
6890{
6891 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
d7718a9d 6892
aa43477b
PB
6893 __io_queue_proc(&pt->req->poll, pt, head,
6894 (struct io_poll_iocb **) &pt->req->async_data);
d7718a9d
JA
6895}
6896
aa43477b
PB
6897static int __io_arm_poll_handler(struct io_kiocb *req,
6898 struct io_poll_iocb *poll,
6899 struct io_poll_table *ipt, __poll_t mask)
d7718a9d
JA
6900{
6901 struct io_ring_ctx *ctx = req->ctx;
aa43477b 6902 int v;
d7718a9d 6903
4d52f338 6904 INIT_HLIST_NODE(&req->hash_node);
8e29da69 6905 req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
aa43477b 6906 io_init_poll_iocb(poll, mask, io_poll_wake);
b90cd197 6907 poll->file = req->file;
d7718a9d
JA
6908
6909 ipt->pt._key = mask;
6910 ipt->req = req;
68b11e8b
PB
6911 ipt->error = 0;
6912 ipt->nr_entries = 0;
d7718a9d 6913
aa43477b
PB
6914 /*
6915 * Take the ownership to delay any tw execution up until we're done
6916 * with poll arming. see io_poll_get_ownership().
6917 */
6918 atomic_set(&req->poll_refs, 1);
d7718a9d 6919 mask = vfs_poll(req->file, &ipt->pt) & poll->events;
aa43477b
PB
6920
6921 if (mask && (poll->events & EPOLLONESHOT)) {
6922 io_poll_remove_entries(req);
6923 /* no one else has access to the req, forget about the ref */
6924 return mask;
6925 }
6926 if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
6927 io_poll_remove_entries(req);
6928 if (!ipt->error)
6929 ipt->error = -EINVAL;
6930 return 0;
6931 }
d7718a9d 6932
79ebeaee 6933 spin_lock(&ctx->completion_lock);
aa43477b
PB
6934 io_poll_req_insert(req);
6935 spin_unlock(&ctx->completion_lock);
6936
6937 if (mask) {
6938 /* can't multishot if failed, just queue the event we've got */
6939 if (unlikely(ipt->error || !ipt->nr_entries))
6940 poll->events |= EPOLLONESHOT;
81459350 6941 __io_poll_execute(req, mask, poll->events);
aa43477b 6942 return 0;
d7718a9d
JA
6943 }
6944
aa43477b
PB
6945 /*
6946 * Release ownership. If someone tried to queue a tw while it was
6947 * locked, kick it off for them.
6948 */
6949 v = atomic_dec_return(&req->poll_refs);
6950 if (unlikely(v & IO_POLL_REF_MASK))
81459350 6951 __io_poll_execute(req, 0, poll->events);
aa43477b
PB
6952 return 0;
6953}
6954
6955static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
6956 struct poll_table_struct *p)
6957{
6958 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6959 struct async_poll *apoll = pt->req->apoll;
6960
6961 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
d7718a9d
JA
6962}
6963
59b735ae
OL
6964enum {
6965 IO_APOLL_OK,
6966 IO_APOLL_ABORTED,
6967 IO_APOLL_READY
6968};
6969
4d9237e3 6970static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
d7718a9d
JA
6971{
6972 const struct io_op_def *def = &io_op_defs[req->opcode];
6973 struct io_ring_ctx *ctx = req->ctx;
6974 struct async_poll *apoll;
6975 struct io_poll_table ipt;
aa43477b
PB
6976 __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
6977 int ret;
d7718a9d 6978
b2d9c3da
PB
6979 if (!def->pollin && !def->pollout)
6980 return IO_APOLL_ABORTED;
10c87333
JA
6981 if (!file_can_poll(req->file))
6982 return IO_APOLL_ABORTED;
6983 if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
658d0a40 6984 return IO_APOLL_ABORTED;
b2d9c3da
PB
6985
6986 if (def->pollin) {
b2d9c3da
PB
6987 mask |= POLLIN | POLLRDNORM;
6988
6989 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
6990 if ((req->opcode == IORING_OP_RECVMSG) &&
6991 (req->sr_msg.msg_flags & MSG_ERRQUEUE))
6992 mask &= ~POLLIN;
6993 } else {
b2d9c3da
PB
6994 mask |= POLLOUT | POLLWRNORM;
6995 }
52dd8640
DY
6996 if (def->poll_exclusive)
6997 mask |= EPOLLEXCLUSIVE;
10c87333
JA
6998 if (req->flags & REQ_F_POLLED) {
6999 apoll = req->apoll;
7000 } else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
7001 !list_empty(&ctx->apoll_cache)) {
4d9237e3
JA
7002 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
7003 poll.wait.entry);
7004 list_del_init(&apoll->poll.wait.entry);
7005 } else {
7006 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
7007 if (unlikely(!apoll))
7008 return IO_APOLL_ABORTED;
7009 }
807abcb0 7010 apoll->double_poll = NULL;
d7718a9d 7011 req->apoll = apoll;
b2d9c3da 7012 req->flags |= REQ_F_POLLED;
d7718a9d
JA
7013 ipt.pt._qproc = io_async_queue_proc;
7014
4d55f238 7015 io_kbuf_recycle(req, issue_flags);
abdad709 7016
aa43477b 7017 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
41a5169c
HX
7018 if (ret || ipt.error)
7019 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
7020
cef216fc 7021 trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode,
236daeae 7022 mask, apoll->poll.events);
59b735ae 7023 return IO_APOLL_OK;
d7718a9d
JA
7024}
7025
76e1b642
JA
7026/*
7027 * Returns true if we found and killed one or more poll requests
7028 */
c072481d
PB
7029static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
7030 struct task_struct *tsk, bool cancel_all)
221c5eb2 7031{
78076bb6 7032 struct hlist_node *tmp;
221c5eb2 7033 struct io_kiocb *req;
aa43477b
PB
7034 bool found = false;
7035 int i;
221c5eb2 7036
79ebeaee 7037 spin_lock(&ctx->completion_lock);
78076bb6
JA
7038 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7039 struct hlist_head *list;
7040
7041 list = &ctx->cancel_hash[i];
f3606e3a 7042 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
42a7b4ed 7043 if (io_match_task_safe(req, tsk, cancel_all)) {
61bc84c4 7044 hlist_del_init(&req->hash_node);
aa43477b
PB
7045 io_poll_cancel_req(req);
7046 found = true;
7047 }
f3606e3a 7048 }
221c5eb2 7049 }
79ebeaee 7050 spin_unlock(&ctx->completion_lock);
aa43477b 7051 return found;
221c5eb2
JA
7052}
7053
b21432b4
JA
7054static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
7055 struct io_cancel_data *cd)
e07785b0 7056 __must_hold(&ctx->completion_lock)
47f46768 7057{
78076bb6 7058 struct hlist_head *list;
47f46768
JA
7059 struct io_kiocb *req;
7060
b21432b4 7061 list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)];
78076bb6 7062 hlist_for_each_entry(req, list, hash_node) {
b21432b4 7063 if (cd->data != req->cqe.user_data)
b41e9852 7064 continue;
9ba5fac8
PB
7065 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
7066 continue;
8e29da69
JA
7067 if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
7068 if (cd->seq == req->work.cancel_seq)
7069 continue;
7070 req->work.cancel_seq = cd->seq;
7071 }
b2cb805f 7072 return req;
47f46768 7073 }
b2cb805f
JA
7074 return NULL;
7075}
7076
4bf94615
JA
7077static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
7078 struct io_cancel_data *cd)
7079 __must_hold(&ctx->completion_lock)
7080{
7081 struct io_kiocb *req;
7082 int i;
7083
7084 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7085 struct hlist_head *list;
7086
7087 list = &ctx->cancel_hash[i];
7088 hlist_for_each_entry(req, list, hash_node) {
970f256e
JA
7089 if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
7090 req->file != cd->file)
4bf94615
JA
7091 continue;
7092 if (cd->seq == req->work.cancel_seq)
7093 continue;
7094 req->work.cancel_seq = cd->seq;
7095 return req;
7096 }
7097 }
7098 return NULL;
7099}
7100
aa43477b
PB
7101static bool io_poll_disarm(struct io_kiocb *req)
7102 __must_hold(&ctx->completion_lock)
7103{
7104 if (!io_poll_get_ownership(req))
7105 return false;
7106 io_poll_remove_entries(req);
7107 hash_del(&req->hash_node);
7108 return true;
7109}
7110
b21432b4 7111static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
e07785b0 7112 __must_hold(&ctx->completion_lock)
b2cb805f 7113{
4bf94615 7114 struct io_kiocb *req;
b2cb805f 7115
970f256e 7116 if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
4bf94615
JA
7117 req = io_poll_file_find(ctx, cd);
7118 else
7119 req = io_poll_find(ctx, false, cd);
b2cb805f
JA
7120 if (!req)
7121 return -ENOENT;
aa43477b
PB
7122 io_poll_cancel_req(req);
7123 return 0;
47f46768
JA
7124}
7125
9096af3e
PB
7126static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
7127 unsigned int flags)
7128{
7129 u32 events;
47f46768 7130
9096af3e
PB
7131 events = READ_ONCE(sqe->poll32_events);
7132#ifdef __BIG_ENDIAN
7133 events = swahw32(events);
7134#endif
7135 if (!(flags & IORING_POLL_ADD_MULTI))
7136 events |= EPOLLONESHOT;
7137 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
47f46768
JA
7138}
7139
c5de0036 7140static int io_poll_update_prep(struct io_kiocb *req,
3529d8c2 7141 const struct io_uring_sqe *sqe)
0969e783 7142{
c5de0036
PB
7143 struct io_poll_update *upd = &req->poll_update;
7144 u32 flags;
7145
73911426 7146 if (sqe->buf_index || sqe->splice_fd_in)
c5de0036
PB
7147 return -EINVAL;
7148 flags = READ_ONCE(sqe->len);
7149 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
7150 IORING_POLL_ADD_MULTI))
7151 return -EINVAL;
7152 /* meaningless without update */
7153 if (flags == IORING_POLL_ADD_MULTI)
0969e783
JA
7154 return -EINVAL;
7155
c5de0036
PB
7156 upd->old_user_data = READ_ONCE(sqe->addr);
7157 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
7158 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
221c5eb2 7159
c5de0036
PB
7160 upd->new_user_data = READ_ONCE(sqe->off);
7161 if (!upd->update_user_data && upd->new_user_data)
7162 return -EINVAL;
7163 if (upd->update_events)
7164 upd->events = io_poll_parse_events(sqe, flags);
7165 else if (sqe->poll32_events)
7166 return -EINVAL;
221c5eb2 7167
221c5eb2
JA
7168 return 0;
7169}
7170
3529d8c2 7171static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
221c5eb2
JA
7172{
7173 struct io_poll_iocb *poll = &req->poll;
c5de0036 7174 u32 flags;
221c5eb2 7175
73911426 7176 if (sqe->buf_index || sqe->off || sqe->addr)
88e41cf9
JA
7177 return -EINVAL;
7178 flags = READ_ONCE(sqe->len);
c5de0036 7179 if (flags & ~IORING_POLL_ADD_MULTI)
221c5eb2 7180 return -EINVAL;
04c76b41
PB
7181 if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
7182 return -EINVAL;
221c5eb2 7183
48dcd38d 7184 io_req_set_refcount(req);
2804ecd8 7185 req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
0969e783
JA
7186 return 0;
7187}
7188
61e98203 7189static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
0969e783
JA
7190{
7191 struct io_poll_iocb *poll = &req->poll;
0969e783 7192 struct io_poll_table ipt;
aa43477b 7193 int ret;
0969e783 7194
d7718a9d 7195 ipt.pt._qproc = io_poll_queue_proc;
36703247 7196
aa43477b
PB
7197 ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
7198 ret = ret ?: ipt.error;
7199 if (ret)
7200 __io_req_complete(req, issue_flags, ret, 0);
7201 return 0;
221c5eb2
JA
7202}
7203
c5de0036 7204static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
b69de288 7205{
b21432b4 7206 struct io_cancel_data cd = { .data = req->poll_update.old_user_data, };
b69de288
JA
7207 struct io_ring_ctx *ctx = req->ctx;
7208 struct io_kiocb *preq;
2bbb146d 7209 int ret2, ret = 0;
cc8e9ba7 7210 bool locked;
b69de288 7211
79ebeaee 7212 spin_lock(&ctx->completion_lock);
b21432b4 7213 preq = io_poll_find(ctx, true, &cd);
aa43477b 7214 if (!preq || !io_poll_disarm(preq)) {
79ebeaee 7215 spin_unlock(&ctx->completion_lock);
aa43477b 7216 ret = preq ? -EALREADY : -ENOENT;
2bbb146d 7217 goto out;
b69de288 7218 }
79ebeaee 7219 spin_unlock(&ctx->completion_lock);
cb3b200e 7220
2bbb146d
PB
7221 if (req->poll_update.update_events || req->poll_update.update_user_data) {
7222 /* only mask one event flags, keep behavior flags */
7223 if (req->poll_update.update_events) {
7224 preq->poll.events &= ~0xffff;
7225 preq->poll.events |= req->poll_update.events & 0xffff;
7226 preq->poll.events |= IO_POLL_UNMASK;
cb3b200e 7227 }
2bbb146d 7228 if (req->poll_update.update_user_data)
cef216fc 7229 preq->cqe.user_data = req->poll_update.new_user_data;
b69de288 7230
2bbb146d
PB
7231 ret2 = io_poll_add(preq, issue_flags);
7232 /* successfully updated, don't complete poll request */
7233 if (!ret2)
7234 goto out;
b69de288 7235 }
6224590d 7236
2bbb146d 7237 req_set_fail(preq);
cef216fc 7238 preq->cqe.res = -ECANCELED;
cc8e9ba7
PB
7239 locked = !(issue_flags & IO_URING_F_UNLOCKED);
7240 io_req_task_complete(preq, &locked);
2bbb146d
PB
7241out:
7242 if (ret < 0)
6224590d 7243 req_set_fail(req);
2bbb146d 7244 /* complete update request, we're done with it */
cc8e9ba7 7245 __io_req_complete(req, issue_flags, ret, 0);
b69de288 7246 return 0;
89850fce
JA
7247}
7248
5262f567
JA
7249static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
7250{
ad8a48ac
JA
7251 struct io_timeout_data *data = container_of(timer,
7252 struct io_timeout_data, timer);
7253 struct io_kiocb *req = data->req;
7254 struct io_ring_ctx *ctx = req->ctx;
5262f567
JA
7255 unsigned long flags;
7256
89850fce 7257 spin_lock_irqsave(&ctx->timeout_lock, flags);
a71976f3 7258 list_del_init(&req->timeout.list);
01cec8c1
PB
7259 atomic_set(&req->ctx->cq_timeouts,
7260 atomic_read(&req->ctx->cq_timeouts) + 1);
89850fce 7261 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
01cec8c1 7262
a90c8bf6
PB
7263 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
7264 req_set_fail(req);
7265
cef216fc 7266 req->cqe.res = -ETIME;
a90c8bf6 7267 req->io_task_work.func = io_req_task_complete;
4813c377 7268 io_req_task_work_add(req, false);
5262f567
JA
7269 return HRTIMER_NORESTART;
7270}
7271
fbd15848 7272static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
b21432b4 7273 struct io_cancel_data *cd)
89850fce 7274 __must_hold(&ctx->timeout_lock)
f254ac04 7275{
fbd15848 7276 struct io_timeout_data *io;
47f46768 7277 struct io_kiocb *req;
fd9c7bc5 7278 bool found = false;
f254ac04 7279
135fcde8 7280 list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
970f256e
JA
7281 if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
7282 cd->data != req->cqe.user_data)
8e29da69 7283 continue;
970f256e 7284 if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
8e29da69
JA
7285 if (cd->seq == req->work.cancel_seq)
7286 continue;
7287 req->work.cancel_seq = cd->seq;
7288 }
7289 found = true;
7290 break;
47f46768 7291 }
fd9c7bc5
PB
7292 if (!found)
7293 return ERR_PTR(-ENOENT);
fbd15848
PB
7294
7295 io = req->async_data;
fd9c7bc5 7296 if (hrtimer_try_to_cancel(&io->timer) == -1)
fbd15848 7297 return ERR_PTR(-EALREADY);
a71976f3 7298 list_del_init(&req->timeout.list);
fbd15848
PB
7299 return req;
7300}
47f46768 7301
b21432b4 7302static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
ec3c3d0f 7303 __must_hold(&ctx->completion_lock)
fbd15848 7304{
3645c200
PB
7305 struct io_kiocb *req;
7306
7307 spin_lock_irq(&ctx->timeout_lock);
b21432b4 7308 req = io_timeout_extract(ctx, cd);
3645c200 7309 spin_unlock_irq(&ctx->timeout_lock);
fbd15848
PB
7310
7311 if (IS_ERR(req))
7312 return PTR_ERR(req);
6695490d 7313 io_req_task_queue_fail(req, -ECANCELED);
f254ac04
JA
7314 return 0;
7315}
7316
50c1df2b
JA
7317static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
7318{
7319 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
7320 case IORING_TIMEOUT_BOOTTIME:
7321 return CLOCK_BOOTTIME;
7322 case IORING_TIMEOUT_REALTIME:
7323 return CLOCK_REALTIME;
7324 default:
7325 /* can't happen, vetted at prep time */
7326 WARN_ON_ONCE(1);
7327 fallthrough;
7328 case 0:
7329 return CLOCK_MONOTONIC;
7330 }
7331}
7332
f1042b6c
PB
7333static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
7334 struct timespec64 *ts, enum hrtimer_mode mode)
7335 __must_hold(&ctx->timeout_lock)
7336{
7337 struct io_timeout_data *io;
7338 struct io_kiocb *req;
7339 bool found = false;
7340
7341 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
cef216fc 7342 found = user_data == req->cqe.user_data;
f1042b6c
PB
7343 if (found)
7344 break;
7345 }
7346 if (!found)
7347 return -ENOENT;
7348
7349 io = req->async_data;
7350 if (hrtimer_try_to_cancel(&io->timer) == -1)
7351 return -EALREADY;
7352 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
7353 io->timer.function = io_link_timeout_fn;
7354 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
7355 return 0;
7356}
7357
9c8e11b3
PB
7358static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
7359 struct timespec64 *ts, enum hrtimer_mode mode)
89850fce 7360 __must_hold(&ctx->timeout_lock)
47f46768 7361{
b21432b4
JA
7362 struct io_cancel_data cd = { .data = user_data, };
7363 struct io_kiocb *req = io_timeout_extract(ctx, &cd);
9c8e11b3 7364 struct io_timeout_data *data;
47f46768 7365
9c8e11b3
PB
7366 if (IS_ERR(req))
7367 return PTR_ERR(req);
47f46768 7368
9c8e11b3
PB
7369 req->timeout.off = 0; /* noseq */
7370 data = req->async_data;
7371 list_add_tail(&req->timeout.list, &ctx->timeout_list);
50c1df2b 7372 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
9c8e11b3
PB
7373 data->timer.function = io_timeout_fn;
7374 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
7375 return 0;
47f46768
JA
7376}
7377
3529d8c2
JA
7378static int io_timeout_remove_prep(struct io_kiocb *req,
7379 const struct io_uring_sqe *sqe)
b29472ee 7380{
9c8e11b3
PB
7381 struct io_timeout_rem *tr = &req->timeout_rem;
7382
61710e43
DA
7383 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
7384 return -EINVAL;
73911426 7385 if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
b29472ee
JA
7386 return -EINVAL;
7387
f1042b6c 7388 tr->ltimeout = false;
9c8e11b3
PB
7389 tr->addr = READ_ONCE(sqe->addr);
7390 tr->flags = READ_ONCE(sqe->timeout_flags);
f1042b6c
PB
7391 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
7392 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
7393 return -EINVAL;
7394 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
7395 tr->ltimeout = true;
7396 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
9c8e11b3
PB
7397 return -EINVAL;
7398 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
7399 return -EFAULT;
2087009c
YB
7400 if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
7401 return -EINVAL;
9c8e11b3
PB
7402 } else if (tr->flags) {
7403 /* timeout removal doesn't support flags */
b29472ee 7404 return -EINVAL;
9c8e11b3 7405 }
b29472ee 7406
b29472ee
JA
7407 return 0;
7408}
7409
8662daec
PB
7410static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
7411{
7412 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
7413 : HRTIMER_MODE_REL;
7414}
7415
11365043
JA
7416/*
7417 * Remove or update an existing timeout command
7418 */
61e98203 7419static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
11365043 7420{
9c8e11b3 7421 struct io_timeout_rem *tr = &req->timeout_rem;
11365043 7422 struct io_ring_ctx *ctx = req->ctx;
47f46768 7423 int ret;
11365043 7424
ec3c3d0f 7425 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
b21432b4
JA
7426 struct io_cancel_data cd = { .data = tr->addr, };
7427
ec3c3d0f 7428 spin_lock(&ctx->completion_lock);
b21432b4 7429 ret = io_timeout_cancel(ctx, &cd);
ec3c3d0f
PB
7430 spin_unlock(&ctx->completion_lock);
7431 } else {
f1042b6c
PB
7432 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
7433
ec3c3d0f 7434 spin_lock_irq(&ctx->timeout_lock);
f1042b6c
PB
7435 if (tr->ltimeout)
7436 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
7437 else
7438 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
ec3c3d0f
PB
7439 spin_unlock_irq(&ctx->timeout_lock);
7440 }
11365043 7441
4e88d6e7 7442 if (ret < 0)
93d2bcd2 7443 req_set_fail(req);
505657bc 7444 io_req_complete_post(req, ret, 0);
11365043 7445 return 0;
5262f567
JA
7446}
7447
3529d8c2 7448static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2d28390a 7449 bool is_timeout_link)
5262f567 7450{
ad8a48ac 7451 struct io_timeout_data *data;
a41525ab 7452 unsigned flags;
56080b02 7453 u32 off = READ_ONCE(sqe->off);
5262f567 7454
73911426 7455 if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
a41525ab 7456 return -EINVAL;
56080b02 7457 if (off && is_timeout_link)
2d28390a 7458 return -EINVAL;
a41525ab 7459 flags = READ_ONCE(sqe->timeout_flags);
6224590d
PB
7460 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
7461 IORING_TIMEOUT_ETIME_SUCCESS))
50c1df2b
JA
7462 return -EINVAL;
7463 /* more than one clock specified is invalid, obviously */
7464 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5262f567 7465 return -EINVAL;
bdf20073 7466
ef9dd637 7467 INIT_LIST_HEAD(&req->timeout.list);
bfe68a22 7468 req->timeout.off = off;
f18ee4cf
PB
7469 if (unlikely(off && !req->ctx->off_timeout_used))
7470 req->ctx->off_timeout_used = true;
26a61679 7471
d6a644a7
PB
7472 if (WARN_ON_ONCE(req_has_async_data(req)))
7473 return -EFAULT;
7474 if (io_alloc_async_data(req))
26a61679
JA
7475 return -ENOMEM;
7476
e8c2bc1f 7477 data = req->async_data;
ad8a48ac 7478 data->req = req;
50c1df2b 7479 data->flags = flags;
ad8a48ac
JA
7480
7481 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5262f567
JA
7482 return -EFAULT;
7483
f6223ff7
YB
7484 if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
7485 return -EINVAL;
7486
e677edbc 7487 INIT_LIST_HEAD(&req->timeout.list);
8662daec 7488 data->mode = io_translate_timeout_mode(flags);
50c1df2b 7489 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
b97e736a
PB
7490
7491 if (is_timeout_link) {
7492 struct io_submit_link *link = &req->ctx->submit_state.link;
7493
7494 if (!link->head)
7495 return -EINVAL;
7496 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
7497 return -EINVAL;
4d13d1a4
PB
7498 req->timeout.head = link->last;
7499 link->last->flags |= REQ_F_ARM_LTIMEOUT;
b97e736a 7500 }
ad8a48ac
JA
7501 return 0;
7502}
7503
61e98203 7504static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
ad8a48ac 7505{
ad8a48ac 7506 struct io_ring_ctx *ctx = req->ctx;
e8c2bc1f 7507 struct io_timeout_data *data = req->async_data;
ad8a48ac 7508 struct list_head *entry;
bfe68a22 7509 u32 tail, off = req->timeout.off;
ad8a48ac 7510
89850fce 7511 spin_lock_irq(&ctx->timeout_lock);
93bd25bb 7512
5262f567
JA
7513 /*
7514 * sqe->off holds how many events that need to occur for this
93bd25bb
JA
7515 * timeout event to be satisfied. If it isn't set, then this is
7516 * a pure timeout request, sequence isn't used.
5262f567 7517 */
8eb7e2d0 7518 if (io_is_timeout_noseq(req)) {
93bd25bb
JA
7519 entry = ctx->timeout_list.prev;
7520 goto add;
7521 }
5262f567 7522
bfe68a22
PB
7523 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
7524 req->timeout.target_seq = tail + off;
5262f567 7525
f010505b
MDG
7526 /* Update the last seq here in case io_flush_timeouts() hasn't.
7527 * This is safe because ->completion_lock is held, and submissions
7528 * and completions are never mixed in the same ->completion_lock section.
7529 */
7530 ctx->cq_last_tm_flush = tail;
7531
5262f567
JA
7532 /*
7533 * Insertion sort, ensuring the first entry in the list is always
7534 * the one we need first.
7535 */
5262f567 7536 list_for_each_prev(entry, &ctx->timeout_list) {
135fcde8
PB
7537 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
7538 timeout.list);
5262f567 7539
8eb7e2d0 7540 if (io_is_timeout_noseq(nxt))
93bd25bb 7541 continue;
bfe68a22
PB
7542 /* nxt.seq is behind @tail, otherwise would've been completed */
7543 if (off >= nxt->timeout.target_seq - tail)
5262f567
JA
7544 break;
7545 }
93bd25bb 7546add:
135fcde8 7547 list_add(&req->timeout.list, entry);
ad8a48ac
JA
7548 data->timer.function = io_timeout_fn;
7549 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
89850fce 7550 spin_unlock_irq(&ctx->timeout_lock);
5262f567
JA
7551 return 0;
7552}
5262f567 7553
62755e35
JA
7554static bool io_cancel_cb(struct io_wq_work *work, void *data)
7555{
7556 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f458dd84 7557 struct io_cancel_data *cd = data;
62755e35 7558
8e29da69
JA
7559 if (req->ctx != cd->ctx)
7560 return false;
970f256e
JA
7561 if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
7562 ;
7563 } else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
4bf94615
JA
7564 if (req->file != cd->file)
7565 return false;
7566 } else {
7567 if (req->cqe.user_data != cd->data)
7568 return false;
7569 }
970f256e 7570 if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
8e29da69
JA
7571 if (cd->seq == req->work.cancel_seq)
7572 return false;
7573 req->work.cancel_seq = cd->seq;
7574 }
7575 return true;
62755e35
JA
7576}
7577
b21432b4
JA
7578static int io_async_cancel_one(struct io_uring_task *tctx,
7579 struct io_cancel_data *cd)
62755e35 7580{
62755e35 7581 enum io_wq_cancel cancel_ret;
62755e35 7582 int ret = 0;
970f256e 7583 bool all;
62755e35 7584
f458dd84 7585 if (!tctx || !tctx->io_wq)
5aa75ed5
JA
7586 return -ENOENT;
7587
970f256e
JA
7588 all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
7589 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
62755e35
JA
7590 switch (cancel_ret) {
7591 case IO_WQ_CANCEL_OK:
7592 ret = 0;
7593 break;
7594 case IO_WQ_CANCEL_RUNNING:
7595 ret = -EALREADY;
7596 break;
7597 case IO_WQ_CANCEL_NOTFOUND:
7598 ret = -ENOENT;
7599 break;
7600 }
7601
e977d6d3
JA
7602 return ret;
7603}
7604
b21432b4 7605static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd)
47f46768 7606{
8cb01fac 7607 struct io_ring_ctx *ctx = req->ctx;
47f46768
JA
7608 int ret;
7609
dadebc35 7610 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
8cb01fac 7611
b21432b4 7612 ret = io_async_cancel_one(req->task->io_uring, cd);
ccbf7261
JA
7613 /*
7614 * Fall-through even for -EALREADY, as we may have poll armed
7615 * that need unarming.
7616 */
7617 if (!ret)
7618 return 0;
505657bc
PB
7619
7620 spin_lock(&ctx->completion_lock);
b21432b4 7621 ret = io_poll_cancel(ctx, cd);
ccbf7261
JA
7622 if (ret != -ENOENT)
7623 goto out;
4bf94615
JA
7624 if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
7625 ret = io_timeout_cancel(ctx, cd);
505657bc
PB
7626out:
7627 spin_unlock(&ctx->completion_lock);
7628 return ret;
47f46768
JA
7629}
7630
970f256e
JA
7631#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
7632 IORING_ASYNC_CANCEL_ANY)
7633
3529d8c2
JA
7634static int io_async_cancel_prep(struct io_kiocb *req,
7635 const struct io_uring_sqe *sqe)
e977d6d3 7636{
4bf94615 7637 if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
61710e43 7638 return -EINVAL;
73911426 7639 if (sqe->off || sqe->len || sqe->splice_fd_in)
e977d6d3
JA
7640 return -EINVAL;
7641
fbf23849 7642 req->cancel.addr = READ_ONCE(sqe->addr);
8e29da69 7643 req->cancel.flags = READ_ONCE(sqe->cancel_flags);
970f256e 7644 if (req->cancel.flags & ~CANCEL_FLAGS)
8e29da69 7645 return -EINVAL;
970f256e
JA
7646 if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) {
7647 if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY)
7648 return -EINVAL;
4bf94615 7649 req->cancel.fd = READ_ONCE(sqe->fd);
970f256e 7650 }
8e29da69 7651
fbf23849
JA
7652 return 0;
7653}
7654
8e29da69
JA
7655static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req,
7656 unsigned int issue_flags)
fbf23849 7657{
970f256e 7658 bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
8e29da69 7659 struct io_ring_ctx *ctx = cd->ctx;
58f99373 7660 struct io_tctx_node *node;
8e29da69 7661 int ret, nr = 0;
58f99373 7662
8e29da69
JA
7663 do {
7664 ret = io_try_cancel(req, cd);
7665 if (ret == -ENOENT)
7666 break;
970f256e 7667 if (!all)
8e29da69
JA
7668 return ret;
7669 nr++;
7670 } while (1);
58f99373
PB
7671
7672 /* slow path, try all io-wq's */
f8929630 7673 io_ring_submit_lock(ctx, issue_flags);
58f99373
PB
7674 ret = -ENOENT;
7675 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
7676 struct io_uring_task *tctx = node->task->io_uring;
fbf23849 7677
8e29da69
JA
7678 ret = io_async_cancel_one(tctx, cd);
7679 if (ret != -ENOENT) {
970f256e 7680 if (!all)
8e29da69
JA
7681 break;
7682 nr++;
7683 }
58f99373 7684 }
f8929630 7685 io_ring_submit_unlock(ctx, issue_flags);
970f256e 7686 return all ? nr : ret;
8e29da69
JA
7687}
7688
7689static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
7690{
7691 struct io_cancel_data cd = {
7692 .ctx = req->ctx,
7693 .data = req->cancel.addr,
7694 .flags = req->cancel.flags,
7695 .seq = atomic_inc_return(&req->ctx->cancel_seq),
7696 };
7697 int ret;
7698
4bf94615
JA
7699 if (cd.flags & IORING_ASYNC_CANCEL_FD) {
7700 if (req->flags & REQ_F_FIXED_FILE)
7701 req->file = io_file_get_fixed(req, req->cancel.fd,
7702 issue_flags);
7703 else
7704 req->file = io_file_get_normal(req, req->cancel.fd);
7705 if (!req->file) {
7706 ret = -EBADF;
7707 goto done;
7708 }
7709 cd.file = req->file;
7710 }
7711
8e29da69 7712 ret = __io_async_cancel(&cd, req, issue_flags);
58f99373 7713done:
58f99373 7714 if (ret < 0)
93d2bcd2 7715 req_set_fail(req);
505657bc 7716 io_req_complete_post(req, ret, 0);
5262f567
JA
7717 return 0;
7718}
7719
269bbe5f 7720static int io_rsrc_update_prep(struct io_kiocb *req,
05f3fb3c
JA
7721 const struct io_uring_sqe *sqe)
7722{
61710e43
DA
7723 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
7724 return -EINVAL;
73911426 7725 if (sqe->rw_flags || sqe->splice_fd_in)
05f3fb3c
JA
7726 return -EINVAL;
7727
269bbe5f
BM
7728 req->rsrc_update.offset = READ_ONCE(sqe->off);
7729 req->rsrc_update.nr_args = READ_ONCE(sqe->len);
7730 if (!req->rsrc_update.nr_args)
05f3fb3c 7731 return -EINVAL;
269bbe5f 7732 req->rsrc_update.arg = READ_ONCE(sqe->addr);
05f3fb3c
JA
7733 return 0;
7734}
7735
889fca73 7736static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
fbf23849
JA
7737{
7738 struct io_ring_ctx *ctx = req->ctx;
c3bdad02 7739 struct io_uring_rsrc_update2 up;
05f3fb3c 7740 int ret;
fbf23849 7741
269bbe5f
BM
7742 up.offset = req->rsrc_update.offset;
7743 up.data = req->rsrc_update.arg;
c3bdad02
PB
7744 up.nr = 0;
7745 up.tags = 0;
615cee49 7746 up.resv = 0;
d8a3ba9c 7747 up.resv2 = 0;
05f3fb3c 7748
f8929630 7749 io_ring_submit_lock(ctx, issue_flags);
fdecb662 7750 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
98f0b3b4 7751 &up, req->rsrc_update.nr_args);
f8929630 7752 io_ring_submit_unlock(ctx, issue_flags);
05f3fb3c
JA
7753
7754 if (ret < 0)
93d2bcd2 7755 req_set_fail(req);
889fca73 7756 __io_req_complete(req, issue_flags, ret, 0);
5262f567
JA
7757 return 0;
7758}
7759
bfe76559 7760static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
f67676d1 7761{
d625c6ee 7762 switch (req->opcode) {
e781573e 7763 case IORING_OP_NOP:
2bb04df7 7764 return io_nop_prep(req, sqe);
f67676d1
JA
7765 case IORING_OP_READV:
7766 case IORING_OP_READ_FIXED:
3a6820f2 7767 case IORING_OP_READ:
f67676d1
JA
7768 case IORING_OP_WRITEV:
7769 case IORING_OP_WRITE_FIXED:
3a6820f2 7770 case IORING_OP_WRITE:
584b0180 7771 return io_prep_rw(req, sqe);
0969e783 7772 case IORING_OP_POLL_ADD:
bfe76559 7773 return io_poll_add_prep(req, sqe);
0969e783 7774 case IORING_OP_POLL_REMOVE:
c5de0036 7775 return io_poll_update_prep(req, sqe);
8ed8d3c3 7776 case IORING_OP_FSYNC:
1155c76a 7777 return io_fsync_prep(req, sqe);
8ed8d3c3 7778 case IORING_OP_SYNC_FILE_RANGE:
1155c76a 7779 return io_sfr_prep(req, sqe);
03b1230c 7780 case IORING_OP_SENDMSG:
fddaface 7781 case IORING_OP_SEND:
bfe76559 7782 return io_sendmsg_prep(req, sqe);
03b1230c 7783 case IORING_OP_RECVMSG:
fddaface 7784 case IORING_OP_RECV:
bfe76559 7785 return io_recvmsg_prep(req, sqe);
f499a021 7786 case IORING_OP_CONNECT:
bfe76559 7787 return io_connect_prep(req, sqe);
2d28390a 7788 case IORING_OP_TIMEOUT:
bfe76559 7789 return io_timeout_prep(req, sqe, false);
b29472ee 7790 case IORING_OP_TIMEOUT_REMOVE:
bfe76559 7791 return io_timeout_remove_prep(req, sqe);
fbf23849 7792 case IORING_OP_ASYNC_CANCEL:
bfe76559 7793 return io_async_cancel_prep(req, sqe);
2d28390a 7794 case IORING_OP_LINK_TIMEOUT:
bfe76559 7795 return io_timeout_prep(req, sqe, true);
8ed8d3c3 7796 case IORING_OP_ACCEPT:
bfe76559 7797 return io_accept_prep(req, sqe);
d63d1b5e 7798 case IORING_OP_FALLOCATE:
bfe76559 7799 return io_fallocate_prep(req, sqe);
15b71abe 7800 case IORING_OP_OPENAT:
bfe76559 7801 return io_openat_prep(req, sqe);
b5dba59e 7802 case IORING_OP_CLOSE:
bfe76559 7803 return io_close_prep(req, sqe);
05f3fb3c 7804 case IORING_OP_FILES_UPDATE:
269bbe5f 7805 return io_rsrc_update_prep(req, sqe);
eddc7ef5 7806 case IORING_OP_STATX:
bfe76559 7807 return io_statx_prep(req, sqe);
4840e418 7808 case IORING_OP_FADVISE:
bfe76559 7809 return io_fadvise_prep(req, sqe);
c1ca757b 7810 case IORING_OP_MADVISE:
bfe76559 7811 return io_madvise_prep(req, sqe);
cebdb986 7812 case IORING_OP_OPENAT2:
bfe76559 7813 return io_openat2_prep(req, sqe);
3e4827b0 7814 case IORING_OP_EPOLL_CTL:
bfe76559 7815 return io_epoll_ctl_prep(req, sqe);
7d67af2c 7816 case IORING_OP_SPLICE:
bfe76559 7817 return io_splice_prep(req, sqe);
ddf0322d 7818 case IORING_OP_PROVIDE_BUFFERS:
bfe76559 7819 return io_provide_buffers_prep(req, sqe);
067524e9 7820 case IORING_OP_REMOVE_BUFFERS:
bfe76559 7821 return io_remove_buffers_prep(req, sqe);
f2a8d5c7 7822 case IORING_OP_TEE:
bfe76559 7823 return io_tee_prep(req, sqe);
36f4fa68
JA
7824 case IORING_OP_SHUTDOWN:
7825 return io_shutdown_prep(req, sqe);
80a261fd
JA
7826 case IORING_OP_RENAMEAT:
7827 return io_renameat_prep(req, sqe);
14a1143b
JA
7828 case IORING_OP_UNLINKAT:
7829 return io_unlinkat_prep(req, sqe);
e34a02dc
DK
7830 case IORING_OP_MKDIRAT:
7831 return io_mkdirat_prep(req, sqe);
7a8721f8
DK
7832 case IORING_OP_SYMLINKAT:
7833 return io_symlinkat_prep(req, sqe);
cf30da90
DK
7834 case IORING_OP_LINKAT:
7835 return io_linkat_prep(req, sqe);
4f57f06c
JA
7836 case IORING_OP_MSG_RING:
7837 return io_msg_ring_prep(req, sqe);
e9621e2b
SR
7838 case IORING_OP_FSETXATTR:
7839 return io_fsetxattr_prep(req, sqe);
7840 case IORING_OP_SETXATTR:
7841 return io_setxattr_prep(req, sqe);
a56834e0
SR
7842 case IORING_OP_FGETXATTR:
7843 return io_fgetxattr_prep(req, sqe);
7844 case IORING_OP_GETXATTR:
7845 return io_getxattr_prep(req, sqe);
1374e08e
JA
7846 case IORING_OP_SOCKET:
7847 return io_socket_prep(req, sqe);
ee692a21
JA
7848 case IORING_OP_URING_CMD:
7849 return io_uring_cmd_prep(req, sqe);
f67676d1
JA
7850 }
7851
bfe76559
PB
7852 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
7853 req->opcode);
bd54b6fe 7854 return -EINVAL;
bfe76559
PB
7855}
7856
93642ef8 7857static int io_req_prep_async(struct io_kiocb *req)
bfe76559 7858{
a196c78b
JA
7859 const struct io_op_def *def = &io_op_defs[req->opcode];
7860
7861 /* assign early for deferred execution for non-fixed file */
7862 if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
13086899 7863 req->file = io_file_get_normal(req, req->cqe.fd);
a196c78b 7864 if (!def->needs_async_setup)
b7e298d2 7865 return 0;
d886e185 7866 if (WARN_ON_ONCE(req_has_async_data(req)))
b7e298d2
PB
7867 return -EFAULT;
7868 if (io_alloc_async_data(req))
7869 return -EAGAIN;
7870
93642ef8
PB
7871 switch (req->opcode) {
7872 case IORING_OP_READV:
93642ef8
PB
7873 return io_rw_prep_async(req, READ);
7874 case IORING_OP_WRITEV:
93642ef8
PB
7875 return io_rw_prep_async(req, WRITE);
7876 case IORING_OP_SENDMSG:
93642ef8
PB
7877 return io_sendmsg_prep_async(req);
7878 case IORING_OP_RECVMSG:
93642ef8
PB
7879 return io_recvmsg_prep_async(req);
7880 case IORING_OP_CONNECT:
7881 return io_connect_prep_async(req);
ee692a21
JA
7882 case IORING_OP_URING_CMD:
7883 return io_uring_cmd_prep_async(req);
93642ef8 7884 }
b7e298d2
PB
7885 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
7886 req->opcode);
7887 return -EFAULT;
f67676d1
JA
7888}
7889
9cf7c104
PB
7890static u32 io_get_sequence(struct io_kiocb *req)
7891{
a3dbdf54 7892 u32 seq = req->ctx->cached_sq_head;
963c6abb 7893 struct io_kiocb *cur;
9cf7c104 7894
a3dbdf54 7895 /* need original cached_sq_head, but it was increased for each req */
963c6abb 7896 io_for_each_link(cur, req)
a3dbdf54
PB
7897 seq--;
7898 return seq;
9cf7c104
PB
7899}
7900
c072481d 7901static __cold void io_drain_req(struct io_kiocb *req)
de0617e4 7902{
a197f664 7903 struct io_ring_ctx *ctx = req->ctx;
27dc8338 7904 struct io_defer_entry *de;
f67676d1 7905 int ret;
e0eb71dc 7906 u32 seq = io_get_sequence(req);
3c19966d 7907
9d858b21 7908 /* Still need defer if there is pending req in defer list. */
e302f104 7909 spin_lock(&ctx->completion_lock);
5e371265 7910 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
e302f104 7911 spin_unlock(&ctx->completion_lock);
e0eb71dc 7912queue:
10c66904 7913 ctx->drain_active = false;
e0eb71dc
PB
7914 io_req_task_queue(req);
7915 return;
10c66904 7916 }
e302f104 7917 spin_unlock(&ctx->completion_lock);
9cf7c104 7918
b7e298d2 7919 ret = io_req_prep_async(req);
e0eb71dc
PB
7920 if (ret) {
7921fail:
7922 io_req_complete_failed(req, ret);
7923 return;
7924 }
cbdcb435 7925 io_prep_async_link(req);
27dc8338 7926 de = kmalloc(sizeof(*de), GFP_KERNEL);
76cc33d7 7927 if (!de) {
1b48773f 7928 ret = -ENOMEM;
e0eb71dc 7929 goto fail;
76cc33d7 7930 }
2d28390a 7931
79ebeaee 7932 spin_lock(&ctx->completion_lock);
9cf7c104 7933 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
79ebeaee 7934 spin_unlock(&ctx->completion_lock);
27dc8338 7935 kfree(de);
e0eb71dc 7936 goto queue;
de0617e4
JA
7937 }
7938
cef216fc 7939 trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
27dc8338 7940 de->req = req;
9cf7c104 7941 de->seq = seq;
27dc8338 7942 list_add_tail(&de->list, &ctx->defer_list);
79ebeaee 7943 spin_unlock(&ctx->completion_lock);
de0617e4
JA
7944}
7945
68fb8979 7946static void io_clean_op(struct io_kiocb *req)
99bc4c38 7947{
8197b053
PB
7948 if (req->flags & REQ_F_BUFFER_SELECTED) {
7949 spin_lock(&req->ctx->completion_lock);
cc3cec83 7950 io_put_kbuf_comp(req);
8197b053
PB
7951 spin_unlock(&req->ctx->completion_lock);
7952 }
99bc4c38 7953
0e1b6fe3
PB
7954 if (req->flags & REQ_F_NEED_CLEANUP) {
7955 switch (req->opcode) {
7956 case IORING_OP_READV:
7957 case IORING_OP_READ_FIXED:
7958 case IORING_OP_READ:
7959 case IORING_OP_WRITEV:
7960 case IORING_OP_WRITE_FIXED:
e8c2bc1f
JA
7961 case IORING_OP_WRITE: {
7962 struct io_async_rw *io = req->async_data;
1dacb4df
PB
7963
7964 kfree(io->free_iovec);
0e1b6fe3 7965 break;
e8c2bc1f 7966 }
0e1b6fe3 7967 case IORING_OP_RECVMSG:
e8c2bc1f
JA
7968 case IORING_OP_SENDMSG: {
7969 struct io_async_msghdr *io = req->async_data;
257e84a5
PB
7970
7971 kfree(io->free_iov);
0e1b6fe3 7972 break;
e8c2bc1f 7973 }
f3cd4850
JA
7974 case IORING_OP_OPENAT:
7975 case IORING_OP_OPENAT2:
7976 if (req->open.filename)
7977 putname(req->open.filename);
7978 break;
80a261fd
JA
7979 case IORING_OP_RENAMEAT:
7980 putname(req->rename.oldpath);
7981 putname(req->rename.newpath);
7982 break;
14a1143b
JA
7983 case IORING_OP_UNLINKAT:
7984 putname(req->unlink.filename);
7985 break;
e34a02dc
DK
7986 case IORING_OP_MKDIRAT:
7987 putname(req->mkdir.filename);
7988 break;
7a8721f8
DK
7989 case IORING_OP_SYMLINKAT:
7990 putname(req->symlink.oldpath);
7991 putname(req->symlink.newpath);
7992 break;
cf30da90
DK
7993 case IORING_OP_LINKAT:
7994 putname(req->hardlink.oldpath);
7995 putname(req->hardlink.newpath);
7996 break;
1b6fe6e0
SR
7997 case IORING_OP_STATX:
7998 if (req->statx.filename)
7999 putname(req->statx.filename);
8000 break;
e9621e2b
SR
8001 case IORING_OP_SETXATTR:
8002 case IORING_OP_FSETXATTR:
a56834e0
SR
8003 case IORING_OP_GETXATTR:
8004 case IORING_OP_FGETXATTR:
e9621e2b
SR
8005 __io_xattr_finish(req);
8006 break;
0e1b6fe3 8007 }
99bc4c38 8008 }
75652a30
JA
8009 if ((req->flags & REQ_F_POLLED) && req->apoll) {
8010 kfree(req->apoll->double_poll);
8011 kfree(req->apoll);
8012 req->apoll = NULL;
8013 }
c854357b 8014 if (req->flags & REQ_F_CREDS)
b8e64b53 8015 put_cred(req->creds);
d886e185
PB
8016 if (req->flags & REQ_F_ASYNC_DATA) {
8017 kfree(req->async_data);
8018 req->async_data = NULL;
8019 }
c854357b 8020 req->flags &= ~IO_REQ_CLEAN_FLAGS;
99bc4c38
PB
8021}
8022
6bf9c47a
JA
8023static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
8024{
8025 if (req->file || !io_op_defs[req->opcode].needs_file)
8026 return true;
8027
8028 if (req->flags & REQ_F_FIXED_FILE)
cef216fc 8029 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
6bf9c47a 8030 else
cef216fc 8031 req->file = io_file_get_normal(req, req->cqe.fd);
6bf9c47a 8032
772f5e00 8033 return !!req->file;
6bf9c47a
JA
8034}
8035
889fca73 8036static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
2b188cc1 8037{
5730b27e 8038 const struct cred *creds = NULL;
d625c6ee 8039 int ret;
2b188cc1 8040
70152140
JA
8041 if (unlikely(!io_assign_file(req, issue_flags)))
8042 return -EBADF;
8043
6878b40e 8044 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
c10d1f98 8045 creds = override_creds(req->creds);
5730b27e 8046
5bd2182d
PM
8047 if (!io_op_defs[req->opcode].audit_skip)
8048 audit_uring_entry(req->opcode);
8049
d625c6ee 8050 switch (req->opcode) {
2b188cc1 8051 case IORING_OP_NOP:
889fca73 8052 ret = io_nop(req, issue_flags);
2b188cc1
JA
8053 break;
8054 case IORING_OP_READV:
edafccee 8055 case IORING_OP_READ_FIXED:
3a6820f2 8056 case IORING_OP_READ:
889fca73 8057 ret = io_read(req, issue_flags);
edafccee 8058 break;
3529d8c2 8059 case IORING_OP_WRITEV:
edafccee 8060 case IORING_OP_WRITE_FIXED:
3a6820f2 8061 case IORING_OP_WRITE:
889fca73 8062 ret = io_write(req, issue_flags);
2b188cc1 8063 break;
c992fe29 8064 case IORING_OP_FSYNC:
45d189c6 8065 ret = io_fsync(req, issue_flags);
c992fe29 8066 break;
221c5eb2 8067 case IORING_OP_POLL_ADD:
61e98203 8068 ret = io_poll_add(req, issue_flags);
221c5eb2
JA
8069 break;
8070 case IORING_OP_POLL_REMOVE:
c5de0036 8071 ret = io_poll_update(req, issue_flags);
221c5eb2 8072 break;
5d17b4a4 8073 case IORING_OP_SYNC_FILE_RANGE:
45d189c6 8074 ret = io_sync_file_range(req, issue_flags);
5d17b4a4 8075 break;
0fa03c62 8076 case IORING_OP_SENDMSG:
889fca73 8077 ret = io_sendmsg(req, issue_flags);
062d04d7 8078 break;
fddaface 8079 case IORING_OP_SEND:
889fca73 8080 ret = io_send(req, issue_flags);
0fa03c62 8081 break;
aa1fa28f 8082 case IORING_OP_RECVMSG:
889fca73 8083 ret = io_recvmsg(req, issue_flags);
062d04d7 8084 break;
fddaface 8085 case IORING_OP_RECV:
889fca73 8086 ret = io_recv(req, issue_flags);
aa1fa28f 8087 break;
5262f567 8088 case IORING_OP_TIMEOUT:
61e98203 8089 ret = io_timeout(req, issue_flags);
5262f567 8090 break;
11365043 8091 case IORING_OP_TIMEOUT_REMOVE:
61e98203 8092 ret = io_timeout_remove(req, issue_flags);
11365043 8093 break;
17f2fe35 8094 case IORING_OP_ACCEPT:
889fca73 8095 ret = io_accept(req, issue_flags);
17f2fe35 8096 break;
f8e85cf2 8097 case IORING_OP_CONNECT:
889fca73 8098 ret = io_connect(req, issue_flags);
f8e85cf2 8099 break;
62755e35 8100 case IORING_OP_ASYNC_CANCEL:
61e98203 8101 ret = io_async_cancel(req, issue_flags);
62755e35 8102 break;
d63d1b5e 8103 case IORING_OP_FALLOCATE:
45d189c6 8104 ret = io_fallocate(req, issue_flags);
d63d1b5e 8105 break;
15b71abe 8106 case IORING_OP_OPENAT:
45d189c6 8107 ret = io_openat(req, issue_flags);
15b71abe 8108 break;
b5dba59e 8109 case IORING_OP_CLOSE:
889fca73 8110 ret = io_close(req, issue_flags);
b5dba59e 8111 break;
05f3fb3c 8112 case IORING_OP_FILES_UPDATE:
889fca73 8113 ret = io_files_update(req, issue_flags);
05f3fb3c 8114 break;
eddc7ef5 8115 case IORING_OP_STATX:
45d189c6 8116 ret = io_statx(req, issue_flags);
eddc7ef5 8117 break;
4840e418 8118 case IORING_OP_FADVISE:
45d189c6 8119 ret = io_fadvise(req, issue_flags);
4840e418 8120 break;
c1ca757b 8121 case IORING_OP_MADVISE:
45d189c6 8122 ret = io_madvise(req, issue_flags);
c1ca757b 8123 break;
cebdb986 8124 case IORING_OP_OPENAT2:
45d189c6 8125 ret = io_openat2(req, issue_flags);
cebdb986 8126 break;
3e4827b0 8127 case IORING_OP_EPOLL_CTL:
889fca73 8128 ret = io_epoll_ctl(req, issue_flags);
3e4827b0 8129 break;
7d67af2c 8130 case IORING_OP_SPLICE:
45d189c6 8131 ret = io_splice(req, issue_flags);
7d67af2c 8132 break;
ddf0322d 8133 case IORING_OP_PROVIDE_BUFFERS:
889fca73 8134 ret = io_provide_buffers(req, issue_flags);
ddf0322d 8135 break;
067524e9 8136 case IORING_OP_REMOVE_BUFFERS:
889fca73 8137 ret = io_remove_buffers(req, issue_flags);
3e4827b0 8138 break;
f2a8d5c7 8139 case IORING_OP_TEE:
45d189c6 8140 ret = io_tee(req, issue_flags);
f2a8d5c7 8141 break;
36f4fa68 8142 case IORING_OP_SHUTDOWN:
45d189c6 8143 ret = io_shutdown(req, issue_flags);
36f4fa68 8144 break;
80a261fd 8145 case IORING_OP_RENAMEAT:
45d189c6 8146 ret = io_renameat(req, issue_flags);
80a261fd 8147 break;
14a1143b 8148 case IORING_OP_UNLINKAT:
45d189c6 8149 ret = io_unlinkat(req, issue_flags);
14a1143b 8150 break;
e34a02dc
DK
8151 case IORING_OP_MKDIRAT:
8152 ret = io_mkdirat(req, issue_flags);
8153 break;
7a8721f8
DK
8154 case IORING_OP_SYMLINKAT:
8155 ret = io_symlinkat(req, issue_flags);
8156 break;
cf30da90
DK
8157 case IORING_OP_LINKAT:
8158 ret = io_linkat(req, issue_flags);
8159 break;
4f57f06c
JA
8160 case IORING_OP_MSG_RING:
8161 ret = io_msg_ring(req, issue_flags);
8162 break;
e9621e2b
SR
8163 case IORING_OP_FSETXATTR:
8164 ret = io_fsetxattr(req, issue_flags);
8165 break;
8166 case IORING_OP_SETXATTR:
8167 ret = io_setxattr(req, issue_flags);
8168 break;
a56834e0
SR
8169 case IORING_OP_FGETXATTR:
8170 ret = io_fgetxattr(req, issue_flags);
8171 break;
8172 case IORING_OP_GETXATTR:
8173 ret = io_getxattr(req, issue_flags);
8174 break;
1374e08e
JA
8175 case IORING_OP_SOCKET:
8176 ret = io_socket(req, issue_flags);
8177 break;
ee692a21
JA
8178 case IORING_OP_URING_CMD:
8179 ret = io_uring_cmd(req, issue_flags);
8180 break;
2b188cc1
JA
8181 default:
8182 ret = -EINVAL;
8183 break;
8184 }
8185
5bd2182d
PM
8186 if (!io_op_defs[req->opcode].audit_skip)
8187 audit_uring_exit(!ret, ret);
8188
5730b27e
JA
8189 if (creds)
8190 revert_creds(creds);
def596e9
JA
8191 if (ret)
8192 return ret;
b532576e 8193 /* If the op doesn't have a file, we're not polling for it */
9983028e 8194 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
9882131c 8195 io_iopoll_req_issued(req, issue_flags);
def596e9
JA
8196
8197 return 0;
2b188cc1
JA
8198}
8199
ebc11b6c
PB
8200static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
8201{
8202 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8203
8204 req = io_put_req_find_next(req);
8205 return req ? &req->work : NULL;
8206}
8207
5280f7e5 8208static void io_wq_submit_work(struct io_wq_work *work)
2b188cc1
JA
8209{
8210 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6bf9c47a 8211 const struct io_op_def *def = &io_op_defs[req->opcode];
d01905db
PB
8212 unsigned int issue_flags = IO_URING_F_UNLOCKED;
8213 bool needs_poll = false;
6bf9c47a 8214 int ret = 0, err = -ECANCELED;
2b188cc1 8215
48dcd38d
PB
8216 /* one will be dropped by ->io_free_work() after returning to io-wq */
8217 if (!(req->flags & REQ_F_REFCOUNT))
8218 __io_req_set_refcount(req, 2);
8219 else
8220 req_ref_get(req);
5d5901a3 8221
cb2d344c 8222 io_arm_ltimeout(req);
6bf9c47a 8223
dadebc35 8224 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
d01905db 8225 if (work->flags & IO_WQ_WORK_CANCEL) {
0f8da75b 8226fail:
6bf9c47a 8227 io_req_task_queue_fail(req, err);
d01905db
PB
8228 return;
8229 }
0f8da75b
PB
8230 if (!io_assign_file(req, issue_flags)) {
8231 err = -EBADF;
8232 work->flags |= IO_WQ_WORK_CANCEL;
8233 goto fail;
8234 }
31b51510 8235
d01905db 8236 if (req->flags & REQ_F_FORCE_ASYNC) {
afb7f56f
PB
8237 bool opcode_poll = def->pollin || def->pollout;
8238
8239 if (opcode_poll && file_can_poll(req->file)) {
8240 needs_poll = true;
d01905db 8241 issue_flags |= IO_URING_F_NONBLOCK;
afb7f56f 8242 }
561fb04a 8243 }
31b51510 8244
d01905db
PB
8245 do {
8246 ret = io_issue_sqe(req, issue_flags);
8247 if (ret != -EAGAIN)
8248 break;
8249 /*
8250 * We can get EAGAIN for iopolled IO even though we're
8251 * forcing a sync submission from here, since we can't
8252 * wait for request slots on the block side.
8253 */
8254 if (!needs_poll) {
8255 cond_resched();
8256 continue;
90fa0288
HX
8257 }
8258
4d9237e3 8259 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
d01905db
PB
8260 return;
8261 /* aborted or ready, in either case retry blocking */
8262 needs_poll = false;
8263 issue_flags &= ~IO_URING_F_NONBLOCK;
8264 } while (1);
31b51510 8265
a3df7698 8266 /* avoid locking problems by failing it from a clean context */
5d5901a3 8267 if (ret)
a3df7698 8268 io_req_task_queue_fail(req, ret);
2b188cc1
JA
8269}
8270
aeca241b 8271static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
042b0d85 8272 unsigned i)
65e19f54 8273{
042b0d85 8274 return &table->files[i];
dafecf19
PB
8275}
8276
65e19f54
JA
8277static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
8278 int index)
8279{
aeca241b 8280 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
65e19f54 8281
a04b0ac0 8282 return (struct file *) (slot->file_ptr & FFS_MASK);
65e19f54
JA
8283}
8284
a04b0ac0 8285static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
9a321c98
PB
8286{
8287 unsigned long file_ptr = (unsigned long) file;
8288
88459b50 8289 file_ptr |= io_file_get_flags(file);
a04b0ac0 8290 file_slot->file_ptr = file_ptr;
65e19f54
JA
8291}
8292
5106dd6e
JA
8293static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
8294 unsigned int issue_flags)
09bb8394 8295{
5106dd6e
JA
8296 struct io_ring_ctx *ctx = req->ctx;
8297 struct file *file = NULL;
ac177053 8298 unsigned long file_ptr;
09bb8394 8299
93f052cb 8300 io_ring_submit_lock(ctx, issue_flags);
5106dd6e 8301
ac177053 8302 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
5106dd6e 8303 goto out;
ac177053
PB
8304 fd = array_index_nospec(fd, ctx->nr_user_files);
8305 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
8306 file = (struct file *) (file_ptr & FFS_MASK);
8307 file_ptr &= ~FFS_MASK;
8308 /* mask in overlapping REQ_F and FFS bits */
35645ac3 8309 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
5106dd6e
JA
8310 io_req_set_rsrc_node(req, ctx, 0);
8311out:
93f052cb 8312 io_ring_submit_unlock(ctx, issue_flags);
ac177053
PB
8313 return file;
8314}
d44f554e 8315
d5361233
JA
8316/*
8317 * Drop the file for requeue operations. Only used of req->file is the
8318 * io_uring descriptor itself.
8319 */
8320static void io_drop_inflight_file(struct io_kiocb *req)
8321{
8322 if (unlikely(req->flags & REQ_F_INFLIGHT)) {
8323 fput(req->file);
8324 req->file = NULL;
8325 req->flags &= ~REQ_F_INFLIGHT;
8326 }
8327}
8328
5106dd6e 8329static struct file *io_file_get_normal(struct io_kiocb *req, int fd)
ac177053 8330{
62906e89 8331 struct file *file = fget(fd);
ac177053 8332
cef216fc 8333 trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
09bb8394 8334
ac177053 8335 /* we don't allow fixed io_uring files */
d5361233
JA
8336 if (file && file->f_op == &io_uring_fops)
8337 req->flags |= REQ_F_INFLIGHT;
8371adf5 8338 return file;
09bb8394
JA
8339}
8340
f237c30a 8341static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
89b263f6
JA
8342{
8343 struct io_kiocb *prev = req->timeout.prev;
617a8948 8344 int ret = -ENOENT;
89b263f6
JA
8345
8346 if (prev) {
b21432b4
JA
8347 if (!(req->task->flags & PF_EXITING)) {
8348 struct io_cancel_data cd = {
8349 .ctx = req->ctx,
8350 .data = prev->cqe.user_data,
8351 };
8352
8353 ret = io_try_cancel(req, &cd);
8354 }
505657bc 8355 io_req_complete_post(req, ret ?: -ETIME, 0);
89b263f6 8356 io_put_req(prev);
89b263f6
JA
8357 } else {
8358 io_req_complete_post(req, -ETIME, 0);
8359 }
8360}
8361
2665abfd 8362static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2b188cc1 8363{
ad8a48ac
JA
8364 struct io_timeout_data *data = container_of(timer,
8365 struct io_timeout_data, timer);
90cd7e42 8366 struct io_kiocb *prev, *req = data->req;
2665abfd 8367 struct io_ring_ctx *ctx = req->ctx;
2665abfd 8368 unsigned long flags;
2665abfd 8369
89b263f6 8370 spin_lock_irqsave(&ctx->timeout_lock, flags);
90cd7e42
PB
8371 prev = req->timeout.head;
8372 req->timeout.head = NULL;
2665abfd
JA
8373
8374 /*
8375 * We don't expect the list to be empty, that will only happen if we
8376 * race with the completion of the linked work.
8377 */
447c19f3 8378 if (prev) {
f2f87370 8379 io_remove_next_linked(prev);
447c19f3
PB
8380 if (!req_ref_inc_not_zero(prev))
8381 prev = NULL;
8382 }
ef9dd637 8383 list_del(&req->timeout.list);
89b263f6
JA
8384 req->timeout.prev = prev;
8385 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
2665abfd 8386
89b263f6 8387 req->io_task_work.func = io_req_task_link_timeout;
4813c377 8388 io_req_task_work_add(req, false);
2665abfd
JA
8389 return HRTIMER_NORESTART;
8390}
8391
de968c18 8392static void io_queue_linked_timeout(struct io_kiocb *req)
2665abfd 8393{
de968c18
PB
8394 struct io_ring_ctx *ctx = req->ctx;
8395
89b263f6 8396 spin_lock_irq(&ctx->timeout_lock);
76a46e06 8397 /*
f2f87370
PB
8398 * If the back reference is NULL, then our linked request finished
8399 * before we got a chance to setup the timer
76a46e06 8400 */
90cd7e42 8401 if (req->timeout.head) {
e8c2bc1f 8402 struct io_timeout_data *data = req->async_data;
94ae5e77 8403
ad8a48ac
JA
8404 data->timer.function = io_link_timeout_fn;
8405 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
8406 data->mode);
ef9dd637 8407 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
2665abfd 8408 }
89b263f6 8409 spin_unlock_irq(&ctx->timeout_lock);
2665abfd 8410 /* drop submission reference */
76a46e06
JA
8411 io_put_req(req);
8412}
2665abfd 8413
7bfa9bad 8414static void io_queue_async(struct io_kiocb *req, int ret)
d475a9a6
PB
8415 __must_hold(&req->ctx->uring_lock)
8416{
7bfa9bad
PB
8417 struct io_kiocb *linked_timeout;
8418
8419 if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
8420 io_req_complete_failed(req, ret);
8421 return;
8422 }
8423
8424 linked_timeout = io_prep_linked_timeout(req);
d475a9a6 8425
4d9237e3 8426 switch (io_arm_poll_handler(req, 0)) {
d475a9a6 8427 case IO_APOLL_READY:
d475a9a6
PB
8428 io_req_task_queue(req);
8429 break;
8430 case IO_APOLL_ABORTED:
8431 /*
8432 * Queued up for async execution, worker will release
8433 * submit reference when the iocb is actually submitted.
8434 */
77955efb 8435 io_queue_iowq(req, NULL);
d475a9a6 8436 break;
b1c62645 8437 case IO_APOLL_OK:
b1c62645 8438 break;
d475a9a6
PB
8439 }
8440
8441 if (linked_timeout)
8442 io_queue_linked_timeout(linked_timeout);
8443}
8444
cbc2e203 8445static inline void io_queue_sqe(struct io_kiocb *req)
282cdc86 8446 __must_hold(&req->ctx->uring_lock)
2b188cc1 8447{
e0c5c576 8448 int ret;
2b188cc1 8449
c5eef2b9 8450 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
193155c8 8451
fff4e40e
PB
8452 if (req->flags & REQ_F_COMPLETE_INLINE) {
8453 io_req_add_compl_list(req);
d9f9d284 8454 return;
fff4e40e 8455 }
491381ce
JA
8456 /*
8457 * We async punt it if the file wasn't marked NOWAIT, or if the file
8458 * doesn't support non-blocking read/write attempts
8459 */
7bfa9bad 8460 if (likely(!ret))
cb2d344c 8461 io_arm_ltimeout(req);
7bfa9bad
PB
8462 else
8463 io_queue_async(req, ret);
2b188cc1
JA
8464}
8465
4652fe3f 8466static void io_queue_sqe_fallback(struct io_kiocb *req)
282cdc86 8467 __must_hold(&req->ctx->uring_lock)
4fe2c963 8468{
17b147f6
PB
8469 if (unlikely(req->flags & REQ_F_FAIL)) {
8470 /*
8471 * We don't submit, fail them all, for that replace hardlinks
8472 * with normal links. Extra REQ_F_LINK is tolerated.
8473 */
8474 req->flags &= ~REQ_F_HARDLINK;
8475 req->flags |= REQ_F_LINK;
8476 io_req_complete_failed(req, req->cqe.res);
e0eb71dc
PB
8477 } else if (unlikely(req->ctx->drain_active)) {
8478 io_drain_req(req);
76cc33d7
PB
8479 } else {
8480 int ret = io_req_prep_async(req);
8481
8482 if (unlikely(ret))
8483 io_req_complete_failed(req, ret);
8484 else
77955efb 8485 io_queue_iowq(req, NULL);
ce35a47a 8486 }
4fe2c963
JL
8487}
8488
b16fed66
PB
8489/*
8490 * Check SQE restrictions (opcode and flags).
8491 *
8492 * Returns 'true' if SQE is allowed, 'false' otherwise.
8493 */
8494static inline bool io_check_restriction(struct io_ring_ctx *ctx,
8495 struct io_kiocb *req,
8496 unsigned int sqe_flags)
4fe2c963 8497{
b16fed66
PB
8498 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
8499 return false;
8500
8501 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
8502 ctx->restrictions.sqe_flags_required)
8503 return false;
8504
8505 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
8506 ctx->restrictions.sqe_flags_required))
8507 return false;
8508
8509 return true;
4fe2c963
JL
8510}
8511
22b2ca31
PB
8512static void io_init_req_drain(struct io_kiocb *req)
8513{
8514 struct io_ring_ctx *ctx = req->ctx;
8515 struct io_kiocb *head = ctx->submit_state.link.head;
8516
8517 ctx->drain_active = true;
8518 if (head) {
8519 /*
8520 * If we need to drain a request in the middle of a link, drain
8521 * the head request and the next request/link after the current
8522 * link. Considering sequential execution of links,
b6c7db32 8523 * REQ_F_IO_DRAIN will be maintained for every request of our
22b2ca31
PB
8524 * link.
8525 */
b6c7db32 8526 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
22b2ca31
PB
8527 ctx->drain_next = true;
8528 }
8529}
8530
b16fed66
PB
8531static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
8532 const struct io_uring_sqe *sqe)
282cdc86 8533 __must_hold(&ctx->uring_lock)
b16fed66 8534{
b16fed66 8535 unsigned int sqe_flags;
fc0ae024 8536 int personality;
4a04d1d1 8537 u8 opcode;
b16fed66 8538
864ea921 8539 /* req is partially pre-initialised, see io_preinit_req() */
4a04d1d1 8540 req->opcode = opcode = READ_ONCE(sqe->opcode);
b16fed66
PB
8541 /* same numerical values with corresponding REQ_F_*, safe to copy */
8542 req->flags = sqe_flags = READ_ONCE(sqe->flags);
cef216fc 8543 req->cqe.user_data = READ_ONCE(sqe->user_data);
b16fed66 8544 req->file = NULL;
c1bdf8ed 8545 req->rsrc_node = NULL;
b16fed66 8546 req->task = current;
b16fed66 8547
4a04d1d1
PB
8548 if (unlikely(opcode >= IORING_OP_LAST)) {
8549 req->opcode = 0;
b16fed66 8550 return -EINVAL;
4a04d1d1 8551 }
68fe256a
PB
8552 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
8553 /* enforce forwards compatibility on users */
8554 if (sqe_flags & ~SQE_VALID_FLAGS)
8555 return -EINVAL;
4e906702
JA
8556 if (sqe_flags & IOSQE_BUFFER_SELECT) {
8557 if (!io_op_defs[opcode].buffer_select)
8558 return -EOPNOTSUPP;
8559 req->buf_index = READ_ONCE(sqe->buf_group);
8560 }
5562a8d7
PB
8561 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
8562 ctx->drain_disabled = true;
8563 if (sqe_flags & IOSQE_IO_DRAIN) {
8564 if (ctx->drain_disabled)
8565 return -EOPNOTSUPP;
22b2ca31 8566 io_init_req_drain(req);
5562a8d7 8567 }
2a56a9bd
PB
8568 }
8569 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
8570 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
8571 return -EACCES;
8572 /* knock it to the slow queue path, will be drained there */
8573 if (ctx->drain_active)
8574 req->flags |= REQ_F_FORCE_ASYNC;
8575 /* if there is no link, we're at "next" request and need to drain */
8576 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
8577 ctx->drain_next = false;
8578 ctx->drain_active = true;
b6c7db32 8579 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
2a56a9bd 8580 }
68fe256a 8581 }
b16fed66 8582
73911426
JA
8583 if (!io_op_defs[opcode].ioprio && sqe->ioprio)
8584 return -EINVAL;
8585 if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
8586 return -EINVAL;
8587
4a04d1d1 8588 if (io_op_defs[opcode].needs_file) {
6d63416d
PB
8589 struct io_submit_state *state = &ctx->submit_state;
8590
cef216fc 8591 req->cqe.fd = READ_ONCE(sqe->fd);
6bf9c47a 8592
6d63416d
PB
8593 /*
8594 * Plug now if we have more than 2 IO left after this, and the
8595 * target is potentially a read/write to block based storage.
8596 */
4a04d1d1 8597 if (state->need_plug && io_op_defs[opcode].plug) {
6d63416d
PB
8598 state->plug_started = true;
8599 state->need_plug = false;
5ca7a8b3 8600 blk_start_plug_nr_ios(&state->plug, state->submit_nr);
6d63416d 8601 }
b16fed66 8602 }
863e0560 8603
003e8dcc
JA
8604 personality = READ_ONCE(sqe->personality);
8605 if (personality) {
cdab10bf
LT
8606 int ret;
8607
c10d1f98
PB
8608 req->creds = xa_load(&ctx->personalities, personality);
8609 if (!req->creds)
003e8dcc 8610 return -EINVAL;
c10d1f98 8611 get_cred(req->creds);
cdc1404a
PM
8612 ret = security_uring_override_creds(req->creds);
8613 if (ret) {
8614 put_cred(req->creds);
8615 return ret;
8616 }
b8e64b53 8617 req->flags |= REQ_F_CREDS;
003e8dcc 8618 }
b16fed66 8619
fc0ae024 8620 return io_req_prep(req, sqe);
b16fed66
PB
8621}
8622
df3becde
PB
8623static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
8624 struct io_kiocb *req, int ret)
8625{
8626 struct io_ring_ctx *ctx = req->ctx;
8627 struct io_submit_link *link = &ctx->submit_state.link;
8628 struct io_kiocb *head = link->head;
8629
8630 trace_io_uring_req_failed(sqe, ctx, req, ret);
8631
8632 /*
8633 * Avoid breaking links in the middle as it renders links with SQPOLL
8634 * unusable. Instead of failing eagerly, continue assembling the link if
8635 * applicable and mark the head with REQ_F_FAIL. The link flushing code
8636 * should find the flag and handle the rest.
8637 */
8638 req_fail_link_node(req, ret);
8639 if (head && !(head->flags & REQ_F_FAIL))
8640 req_fail_link_node(head, -ECANCELED);
8641
8642 if (!(req->flags & IO_REQ_LINK_FLAGS)) {
8643 if (head) {
8644 link->last->link = req;
8645 link->head = NULL;
8646 req = head;
8647 }
8648 io_queue_sqe_fallback(req);
8649 return ret;
003e8dcc 8650 }
b16fed66 8651
df3becde
PB
8652 if (head)
8653 link->last->link = req;
8654 else
8655 link->head = req;
8656 link->last = req;
8657 return 0;
b16fed66
PB
8658}
8659
df3becde 8660static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
a1ab7b35 8661 const struct io_uring_sqe *sqe)
282cdc86 8662 __must_hold(&ctx->uring_lock)
9e645e11 8663{
a1ab7b35 8664 struct io_submit_link *link = &ctx->submit_state.link;
ef4ff581 8665 int ret;
9e645e11 8666
a6b8cadc 8667 ret = io_init_req(ctx, req, sqe);
df3becde
PB
8668 if (unlikely(ret))
8669 return io_submit_fail_init(sqe, req, ret);
441b8a78 8670
be7053b7 8671 /* don't need @sqe from now on */
cef216fc 8672 trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
236daeae
OL
8673 req->flags, true,
8674 ctx->flags & IORING_SETUP_SQPOLL);
a6b8cadc 8675
9e645e11
JA
8676 /*
8677 * If we already have a head request, queue this one for async
8678 * submittal once the head completes. If we don't have a head but
8679 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
8680 * submitted sync once the chain is complete. If none of those
8681 * conditions are true (normal request), then just queue it.
8682 */
924a07e4 8683 if (unlikely(link->head)) {
df3becde
PB
8684 ret = io_req_prep_async(req);
8685 if (unlikely(ret))
8686 return io_submit_fail_init(sqe, req, ret);
8687
8688 trace_io_uring_link(ctx, req, link->head);
f2f87370 8689 link->last->link = req;
863e0560 8690 link->last = req;
32fe525b 8691
da1a08c5 8692 if (req->flags & IO_REQ_LINK_FLAGS)
f15a3431 8693 return 0;
df3becde
PB
8694 /* last request of the link, flush it */
8695 req = link->head;
f15a3431 8696 link->head = NULL;
924a07e4
PB
8697 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
8698 goto fallback;
8699
8700 } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
8701 REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
8702 if (req->flags & IO_REQ_LINK_FLAGS) {
8703 link->head = req;
8704 link->last = req;
8705 } else {
8706fallback:
8707 io_queue_sqe_fallback(req);
8708 }
f15a3431 8709 return 0;
9e645e11 8710 }
2e6e1fde 8711
f15a3431 8712 io_queue_sqe(req);
1d4240cc 8713 return 0;
9e645e11
JA
8714}
8715
9a56a232
JA
8716/*
8717 * Batched submission is done, ensure local IO is flushed out.
8718 */
553deffd 8719static void io_submit_state_end(struct io_ring_ctx *ctx)
9a56a232 8720{
553deffd
PB
8721 struct io_submit_state *state = &ctx->submit_state;
8722
e126391c
PB
8723 if (unlikely(state->link.head))
8724 io_queue_sqe_fallback(state->link.head);
553deffd 8725 /* flush only after queuing links as they can generate completions */
c450178d 8726 io_submit_flush_completions(ctx);
27926b68
JA
8727 if (state->plug_started)
8728 blk_finish_plug(&state->plug);
9a56a232
JA
8729}
8730
8731/*
8732 * Start submission side cache.
8733 */
8734static void io_submit_state_start(struct io_submit_state *state,
ba88ff11 8735 unsigned int max_ios)
9a56a232 8736{
27926b68 8737 state->plug_started = false;
4b628aeb 8738 state->need_plug = max_ios > 2;
5ca7a8b3 8739 state->submit_nr = max_ios;
a1ab7b35
PB
8740 /* set only head, no need to init link_last in advance */
8741 state->link.head = NULL;
9a56a232
JA
8742}
8743
2b188cc1
JA
8744static void io_commit_sqring(struct io_ring_ctx *ctx)
8745{
75b28aff 8746 struct io_rings *rings = ctx->rings;
2b188cc1 8747
caf582c6
PB
8748 /*
8749 * Ensure any loads from the SQEs are done at this point,
8750 * since once we write the new head, the application could
8751 * write new data to them.
8752 */
8753 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2b188cc1
JA
8754}
8755
2b188cc1 8756/*
dd9ae8a0 8757 * Fetch an sqe, if one is available. Note this returns a pointer to memory
2b188cc1
JA
8758 * that is mapped by userspace. This means that care needs to be taken to
8759 * ensure that reads are stable, as we cannot rely on userspace always
8760 * being a good citizen. If members of the sqe are validated and then later
8761 * used, it's important that those reads are done through READ_ONCE() to
8762 * prevent a re-load down the line.
8763 */
709b302f 8764static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2b188cc1 8765{
ea5ab3b5 8766 unsigned head, mask = ctx->sq_entries - 1;
17d3aeb3 8767 unsigned sq_idx = ctx->cached_sq_head++ & mask;
2b188cc1
JA
8768
8769 /*
8770 * The cached sq head (or cq tail) serves two purposes:
8771 *
8772 * 1) allows us to batch the cost of updating the user visible
8773 * head updates.
8774 * 2) allows the kernel side to track the head on its own, even
8775 * though the application is the one updating it.
8776 */
17d3aeb3 8777 head = READ_ONCE(ctx->sq_array[sq_idx]);
ebdeb7c0
JA
8778 if (likely(head < ctx->sq_entries)) {
8779 /* double index for 128-byte SQEs, twice as long */
8780 if (ctx->flags & IORING_SETUP_SQE128)
8781 head <<= 1;
709b302f 8782 return &ctx->sq_sqes[head];
ebdeb7c0 8783 }
2b188cc1
JA
8784
8785 /* drop invalid entries */
15641e42
PB
8786 ctx->cq_extra--;
8787 WRITE_ONCE(ctx->rings->sq_dropped,
8788 READ_ONCE(ctx->rings->sq_dropped) + 1);
709b302f
PB
8789 return NULL;
8790}
8791
0f212204 8792static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
282cdc86 8793 __must_hold(&ctx->uring_lock)
6c271ce2 8794{
69629809 8795 unsigned int entries = io_sqring_entries(ctx);
8e6971a8
PB
8796 unsigned int left;
8797 int ret;
6c271ce2 8798
51d48dab 8799 if (unlikely(!entries))
69629809 8800 return 0;
ee7d46d9 8801 /* make sure SQ entry isn't read before tail */
8e6971a8
PB
8802 ret = left = min3(nr, ctx->sq_entries, entries);
8803 io_get_task_refs(left);
8804 io_submit_state_start(&ctx->submit_state, left);
6c271ce2 8805
69629809 8806 do {
3529d8c2 8807 const struct io_uring_sqe *sqe;
196be95c 8808 struct io_kiocb *req;
fb5ccc98 8809
8e6971a8 8810 if (unlikely(!io_alloc_req_refill(ctx)))
fb5ccc98 8811 break;
a33ae9ce 8812 req = io_alloc_req(ctx);
4fccfcbb
PB
8813 sqe = io_get_sqe(ctx);
8814 if (unlikely(!sqe)) {
fa05457a 8815 io_req_add_to_cache(req, ctx);
4fccfcbb
PB
8816 break;
8817 }
6c271ce2 8818
1cd15904
PB
8819 /*
8820 * Continue submitting even for sqe failure if the
8821 * ring was setup with IORING_SETUP_SUBMIT_ALL
8822 */
8823 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
8824 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
8825 left--;
8826 break;
bcbb7bf6 8827 }
1cd15904 8828 } while (--left);
9466f437 8829
8e6971a8
PB
8830 if (unlikely(left)) {
8831 ret -= left;
8832 /* try again if it submitted nothing and can't allocate a req */
8833 if (!ret && io_req_cache_empty(ctx))
8834 ret = -EAGAIN;
8835 current->io_uring->cached_refs += left;
9466f437 8836 }
6c271ce2 8837
553deffd 8838 io_submit_state_end(ctx);
ae9428ca
PB
8839 /* Commit SQ ring head once we've consumed and submitted all SQEs */
8840 io_commit_sqring(ctx);
8e6971a8 8841 return ret;
6c271ce2
JA
8842}
8843
e4b6d902
PB
8844static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
8845{
8846 return READ_ONCE(sqd->state);
8847}
8848
08369246 8849static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6c271ce2 8850{
c8d1ba58 8851 unsigned int to_submit;
bdcd3eab 8852 int ret = 0;
6c271ce2 8853
c8d1ba58 8854 to_submit = io_sqring_entries(ctx);
e95eee2d 8855 /* if we're handling multiple rings, cap submit size for fairness */
4ce8ad95
OL
8856 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
8857 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
e95eee2d 8858
5eef4e87 8859 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
948e1947
PB
8860 const struct cred *creds = NULL;
8861
8862 if (ctx->sq_creds != current_cred())
8863 creds = override_creds(ctx->sq_creds);
a4c0b3de 8864
c8d1ba58 8865 mutex_lock(&ctx->uring_lock);
5eef4e87 8866 if (!wq_list_empty(&ctx->iopoll_list))
5ba3c874 8867 io_do_iopoll(ctx, true);
906a3c6f 8868
3b763ba1
PB
8869 /*
8870 * Don't submit if refs are dying, good for io_uring_register(),
8871 * but also it is relied upon by io_ring_exit_work()
8872 */
0298ef96
PB
8873 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
8874 !(ctx->flags & IORING_SETUP_R_DISABLED))
08369246 8875 ret = io_submit_sqes(ctx, to_submit);
c8d1ba58 8876 mutex_unlock(&ctx->uring_lock);
cb318216 8877
acfb381d
PB
8878 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
8879 wake_up(&ctx->sqo_sq_wait);
948e1947
PB
8880 if (creds)
8881 revert_creds(creds);
acfb381d 8882 }
6c271ce2 8883
08369246
XW
8884 return ret;
8885}
6c271ce2 8886
c072481d 8887static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
08369246
XW
8888{
8889 struct io_ring_ctx *ctx;
8890 unsigned sq_thread_idle = 0;
6c271ce2 8891
c9dca27d
PB
8892 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8893 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
08369246 8894 sqd->sq_thread_idle = sq_thread_idle;
c8d1ba58 8895}
6c271ce2 8896
e4b6d902
PB
8897static bool io_sqd_handle_event(struct io_sq_data *sqd)
8898{
8899 bool did_sig = false;
8900 struct ksignal ksig;
8901
8902 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
8903 signal_pending(current)) {
8904 mutex_unlock(&sqd->lock);
8905 if (signal_pending(current))
8906 did_sig = get_signal(&ksig);
8907 cond_resched();
8908 mutex_lock(&sqd->lock);
8909 }
e4b6d902
PB
8910 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8911}
8912
c8d1ba58
JA
8913static int io_sq_thread(void *data)
8914{
69fb2131
JA
8915 struct io_sq_data *sqd = data;
8916 struct io_ring_ctx *ctx;
a0d9205f 8917 unsigned long timeout = 0;
37d1e2e3 8918 char buf[TASK_COMM_LEN];
08369246 8919 DEFINE_WAIT(wait);
6c271ce2 8920
696ee88a 8921 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
37d1e2e3 8922 set_task_comm(current, buf);
37d1e2e3
JA
8923
8924 if (sqd->sq_cpu != -1)
8925 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
8926 else
8927 set_cpus_allowed_ptr(current, cpu_online_mask);
8928 current->flags |= PF_NO_SETAFFINITY;
8929
5bd2182d
PM
8930 audit_alloc_kernel(current);
8931
09a6f4ef 8932 mutex_lock(&sqd->lock);
e4b6d902 8933 while (1) {
1a924a80 8934 bool cap_entries, sqt_spin = false;
c1edbf5f 8935
e4b6d902
PB
8936 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
8937 if (io_sqd_handle_event(sqd))
c7d95613 8938 break;
08369246
XW
8939 timeout = jiffies + sqd->sq_thread_idle;
8940 }
e4b6d902 8941
e95eee2d 8942 cap_entries = !list_is_singular(&sqd->ctx_list);
69fb2131 8943 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
948e1947 8944 int ret = __io_sq_thread(ctx, cap_entries);
7c30f36a 8945
5eef4e87 8946 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
08369246 8947 sqt_spin = true;
69fb2131 8948 }
dd432ea5
PB
8949 if (io_run_task_work())
8950 sqt_spin = true;
6c271ce2 8951
08369246 8952 if (sqt_spin || !time_after(jiffies, timeout)) {
c8d1ba58 8953 cond_resched();
08369246
XW
8954 if (sqt_spin)
8955 timeout = jiffies + sqd->sq_thread_idle;
8956 continue;
8957 }
8958
08369246 8959 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7f62d40d 8960 if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
1a924a80
PB
8961 bool needs_sched = true;
8962
724cb4f9 8963 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
3a4b89a2
JA
8964 atomic_or(IORING_SQ_NEED_WAKEUP,
8965 &ctx->rings->sq_flags);
724cb4f9 8966 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
5eef4e87 8967 !wq_list_empty(&ctx->iopoll_list)) {
724cb4f9
HX
8968 needs_sched = false;
8969 break;
8970 }
649bb75d
AK
8971
8972 /*
8973 * Ensure the store of the wakeup flag is not
8974 * reordered with the load of the SQ tail
8975 */
f2e030dd 8976 smp_mb__after_atomic();
649bb75d 8977
724cb4f9
HX
8978 if (io_sqring_entries(ctx)) {
8979 needs_sched = false;
8980 break;
8981 }
8982 }
8983
8984 if (needs_sched) {
8985 mutex_unlock(&sqd->lock);
8986 schedule();
8987 mutex_lock(&sqd->lock);
8988 }
69fb2131 8989 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
3a4b89a2
JA
8990 atomic_andnot(IORING_SQ_NEED_WAKEUP,
8991 &ctx->rings->sq_flags);
6c271ce2 8992 }
08369246
XW
8993
8994 finish_wait(&sqd->wait, &wait);
8995 timeout = jiffies + sqd->sq_thread_idle;
6c271ce2 8996 }
28cea78a 8997
78cc687b 8998 io_uring_cancel_generic(true, sqd);
37d1e2e3 8999 sqd->thread = NULL;
05962f95 9000 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
3a4b89a2 9001 atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
521d6a73 9002 io_run_task_work();
734551df
PB
9003 mutex_unlock(&sqd->lock);
9004
5bd2182d
PM
9005 audit_free(current);
9006
37d1e2e3
JA
9007 complete(&sqd->exited);
9008 do_exit(0);
6c271ce2
JA
9009}
9010
bda52162
JA
9011struct io_wait_queue {
9012 struct wait_queue_entry wq;
9013 struct io_ring_ctx *ctx;
5fd46178 9014 unsigned cq_tail;
bda52162
JA
9015 unsigned nr_timeouts;
9016};
9017
6c503150 9018static inline bool io_should_wake(struct io_wait_queue *iowq)
bda52162
JA
9019{
9020 struct io_ring_ctx *ctx = iowq->ctx;
5fd46178 9021 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
bda52162
JA
9022
9023 /*
d195a66e 9024 * Wake up if we have enough events, or if a timeout occurred since we
bda52162
JA
9025 * started waiting. For timeouts, we always want to return to userspace,
9026 * regardless of event count.
9027 */
5fd46178 9028 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
bda52162
JA
9029}
9030
9031static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
9032 int wake_flags, void *key)
9033{
9034 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
9035 wq);
9036
6c503150
PB
9037 /*
9038 * Cannot safely flush overflowed CQEs from here, ensure we wake up
9039 * the task, and the next invocation will do it.
9040 */
10988a0a
DY
9041 if (io_should_wake(iowq) ||
9042 test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
6c503150
PB
9043 return autoremove_wake_function(curr, mode, wake_flags, key);
9044 return -1;
bda52162
JA
9045}
9046
af9c1a44
JA
9047static int io_run_task_work_sig(void)
9048{
9049 if (io_run_task_work())
9050 return 1;
0b8cfa97 9051 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
792ee0f6 9052 return -ERESTARTSYS;
c5020bc8
OL
9053 if (task_sigpending(current))
9054 return -EINTR;
9055 return 0;
af9c1a44
JA
9056}
9057
eeb60b9a
PB
9058/* when returns >0, the caller should retry */
9059static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
9060 struct io_wait_queue *iowq,
22833966 9061 ktime_t timeout)
eeb60b9a
PB
9062{
9063 int ret;
155bc950 9064 unsigned long check_cq;
eeb60b9a
PB
9065
9066 /* make sure we run task_work before checking for signals */
9067 ret = io_run_task_work_sig();
9068 if (ret || io_should_wake(iowq))
9069 return ret;
155bc950 9070 check_cq = READ_ONCE(ctx->check_cq);
eeb60b9a 9071 /* let the caller flush overflows, retry */
155bc950 9072 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
eeb60b9a 9073 return 1;
155bc950
DY
9074 if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
9075 return -EBADR;
22833966
JA
9076 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
9077 return -ETIME;
9078 return 1;
eeb60b9a
PB
9079}
9080
2b188cc1
JA
9081/*
9082 * Wait until events become available, if we don't already have some. The
9083 * application must reap them itself, as they reside on the shared cq ring.
9084 */
9085static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
c73ebb68
HX
9086 const sigset_t __user *sig, size_t sigsz,
9087 struct __kernel_timespec __user *uts)
2b188cc1 9088{
90291099 9089 struct io_wait_queue iowq;
75b28aff 9090 struct io_rings *rings = ctx->rings;
22833966 9091 ktime_t timeout = KTIME_MAX;
c1d5a224 9092 int ret;
2b188cc1 9093
b41e9852 9094 do {
90f67366 9095 io_cqring_overflow_flush(ctx);
6c503150 9096 if (io_cqring_events(ctx) >= min_events)
b41e9852 9097 return 0;
4c6e277c 9098 if (!io_run_task_work())
b41e9852 9099 break;
b41e9852 9100 } while (1);
2b188cc1
JA
9101
9102 if (sig) {
9e75ad5d
AB
9103#ifdef CONFIG_COMPAT
9104 if (in_compat_syscall())
9105 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
b772434b 9106 sigsz);
9e75ad5d
AB
9107 else
9108#endif
b772434b 9109 ret = set_user_sigmask(sig, sigsz);
9e75ad5d 9110
2b188cc1
JA
9111 if (ret)
9112 return ret;
9113 }
9114
950e79dd
OL
9115 if (uts) {
9116 struct timespec64 ts;
9117
9118 if (get_timespec64(&ts, uts))
9119 return -EFAULT;
9120 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
9121 }
9122
90291099
PB
9123 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
9124 iowq.wq.private = current;
9125 INIT_LIST_HEAD(&iowq.wq.entry);
9126 iowq.ctx = ctx;
bda52162 9127 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
5fd46178 9128 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
90291099 9129
c826bd7a 9130 trace_io_uring_cqring_wait(ctx, min_events);
bda52162 9131 do {
ca0a2651 9132 /* if we can't even flush overflow, don't wait for more */
90f67366 9133 if (!io_cqring_overflow_flush(ctx)) {
ca0a2651
JA
9134 ret = -EBUSY;
9135 break;
9136 }
311997b3 9137 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
bda52162 9138 TASK_INTERRUPTIBLE);
22833966 9139 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
ca0a2651 9140 cond_resched();
eeb60b9a 9141 } while (ret > 0);
bda52162 9142
b4f20bb4 9143 finish_wait(&ctx->cq_wait, &iowq.wq);
b7db41c9 9144 restore_saved_sigmask_unless(ret == -EINTR);
2b188cc1 9145
75b28aff 9146 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2b188cc1
JA
9147}
9148
9123c8ff 9149static void io_free_page_table(void **table, size_t size)
05f3fb3c 9150{
9123c8ff 9151 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
05f3fb3c 9152
846a4ef2 9153 for (i = 0; i < nr_tables; i++)
9123c8ff
PB
9154 kfree(table[i]);
9155 kfree(table);
9156}
9157
c072481d 9158static __cold void **io_alloc_page_table(size_t size)
9123c8ff
PB
9159{
9160 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
9161 size_t init_size = size;
9162 void **table;
9163
0bea96f5 9164 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
9123c8ff
PB
9165 if (!table)
9166 return NULL;
9167
9168 for (i = 0; i < nr_tables; i++) {
27f6b318 9169 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
9123c8ff 9170
0bea96f5 9171 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
9123c8ff
PB
9172 if (!table[i]) {
9173 io_free_page_table(table, init_size);
9174 return NULL;
9175 }
9176 size -= this_size;
9177 }
9178 return table;
05f3fb3c
JA
9179}
9180
28a9fe25 9181static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
1642b445 9182{
28a9fe25
PB
9183 percpu_ref_exit(&ref_node->refs);
9184 kfree(ref_node);
1642b445
PB
9185}
9186
c072481d 9187static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
b9bd2bea
PB
9188{
9189 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
9190 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
9191 unsigned long flags;
9192 bool first_add = false;
b36a2050 9193 unsigned long delay = HZ;
b9bd2bea
PB
9194
9195 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
9196 node->done = true;
9197
b36a2050
DY
9198 /* if we are mid-quiesce then do not delay */
9199 if (node->rsrc_data->quiesce)
9200 delay = 0;
9201
b9bd2bea
PB
9202 while (!list_empty(&ctx->rsrc_ref_list)) {
9203 node = list_first_entry(&ctx->rsrc_ref_list,
9204 struct io_rsrc_node, node);
9205 /* recycle ref nodes in order */
9206 if (!node->done)
9207 break;
9208 list_del(&node->node);
9209 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
9210 }
9211 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
9212
9213 if (first_add)
b36a2050 9214 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
b9bd2bea
PB
9215}
9216
f6133fbd 9217static struct io_rsrc_node *io_rsrc_node_alloc(void)
b9bd2bea
PB
9218{
9219 struct io_rsrc_node *ref_node;
9220
9221 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
9222 if (!ref_node)
9223 return NULL;
9224
9225 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
9226 0, GFP_KERNEL)) {
9227 kfree(ref_node);
9228 return NULL;
9229 }
9230 INIT_LIST_HEAD(&ref_node->node);
9231 INIT_LIST_HEAD(&ref_node->rsrc_list);
9232 ref_node->done = false;
9233 return ref_node;
9234}
9235
a7f0ed5a
PB
9236static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
9237 struct io_rsrc_data *data_to_kill)
ab409402 9238 __must_hold(&ctx->uring_lock)
6b06314c 9239{
a7f0ed5a
PB
9240 WARN_ON_ONCE(!ctx->rsrc_backup_node);
9241 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
6b06314c 9242
ab409402
PB
9243 io_rsrc_refs_drop(ctx);
9244
a7f0ed5a
PB
9245 if (data_to_kill) {
9246 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
82fbcfa9 9247
a7f0ed5a 9248 rsrc_node->rsrc_data = data_to_kill;
4956b9ea 9249 spin_lock_irq(&ctx->rsrc_ref_lock);
a7f0ed5a 9250 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
4956b9ea 9251 spin_unlock_irq(&ctx->rsrc_ref_lock);
82fbcfa9 9252
3e942498 9253 atomic_inc(&data_to_kill->refs);
a7f0ed5a
PB
9254 percpu_ref_kill(&rsrc_node->refs);
9255 ctx->rsrc_node = NULL;
9256 }
6b06314c 9257
a7f0ed5a
PB
9258 if (!ctx->rsrc_node) {
9259 ctx->rsrc_node = ctx->rsrc_backup_node;
9260 ctx->rsrc_backup_node = NULL;
9261 }
8bad28d8
HX
9262}
9263
a7f0ed5a 9264static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8dd03afe
PB
9265{
9266 if (ctx->rsrc_backup_node)
9267 return 0;
f6133fbd 9268 ctx->rsrc_backup_node = io_rsrc_node_alloc();
8dd03afe 9269 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8bad28d8
HX
9270}
9271
c072481d
PB
9272static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
9273 struct io_ring_ctx *ctx)
8bad28d8
HX
9274{
9275 int ret;
05589553 9276
215c3902 9277 /* As we may drop ->uring_lock, other task may have started quiesce */
8bad28d8
HX
9278 if (data->quiesce)
9279 return -ENXIO;
05589553 9280
8bad28d8 9281 data->quiesce = true;
1ffc5422 9282 do {
a7f0ed5a 9283 ret = io_rsrc_node_switch_start(ctx);
8dd03afe 9284 if (ret)
f2303b1f 9285 break;
a7f0ed5a 9286 io_rsrc_node_switch(ctx, data);
f2303b1f 9287
3e942498
PB
9288 /* kill initial ref, already quiesced if zero */
9289 if (atomic_dec_and_test(&data->refs))
9290 break;
c018db4a 9291 mutex_unlock(&ctx->uring_lock);
8bad28d8 9292 flush_delayed_work(&ctx->rsrc_put_work);
1ffc5422 9293 ret = wait_for_completion_interruptible(&data->done);
c018db4a
JA
9294 if (!ret) {
9295 mutex_lock(&ctx->uring_lock);
80912cef
DY
9296 if (atomic_read(&data->refs) > 0) {
9297 /*
9298 * it has been revived by another thread while
9299 * we were unlocked
9300 */
9301 mutex_unlock(&ctx->uring_lock);
9302 } else {
9303 break;
9304 }
c018db4a 9305 }
8bad28d8 9306
3e942498
PB
9307 atomic_inc(&data->refs);
9308 /* wait for all works potentially completing data->done */
9309 flush_delayed_work(&ctx->rsrc_put_work);
cb5e1b81 9310 reinit_completion(&data->done);
8dd03afe 9311
1ffc5422 9312 ret = io_run_task_work_sig();
8bad28d8 9313 mutex_lock(&ctx->uring_lock);
f2303b1f 9314 } while (ret >= 0);
8bad28d8 9315 data->quiesce = false;
05f3fb3c 9316
8bad28d8 9317 return ret;
d7954b2b
BM
9318}
9319
2d091d62
PB
9320static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
9321{
9322 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
9323 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
9324
9325 return &data->tags[table_idx][off];
9326}
9327
44b31f2f 9328static void io_rsrc_data_free(struct io_rsrc_data *data)
1ad555c6 9329{
2d091d62
PB
9330 size_t size = data->nr * sizeof(data->tags[0][0]);
9331
9332 if (data->tags)
9333 io_free_page_table((void **)data->tags, size);
44b31f2f
PB
9334 kfree(data);
9335}
9336
c072481d
PB
9337static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
9338 u64 __user *utags, unsigned nr,
9339 struct io_rsrc_data **pdata)
1ad555c6 9340{
b895c9a6 9341 struct io_rsrc_data *data;
2d091d62 9342 int ret = -ENOMEM;
d878c816 9343 unsigned i;
1ad555c6
BM
9344
9345 data = kzalloc(sizeof(*data), GFP_KERNEL);
9346 if (!data)
d878c816 9347 return -ENOMEM;
2d091d62 9348 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
b60c8dce 9349 if (!data->tags) {
1ad555c6 9350 kfree(data);
d878c816
PB
9351 return -ENOMEM;
9352 }
2d091d62
PB
9353
9354 data->nr = nr;
9355 data->ctx = ctx;
9356 data->do_put = do_put;
d878c816 9357 if (utags) {
2d091d62 9358 ret = -EFAULT;
d878c816 9359 for (i = 0; i < nr; i++) {
fdd1dc31
CIK
9360 u64 *tag_slot = io_get_tag_slot(data, i);
9361
9362 if (copy_from_user(tag_slot, &utags[i],
9363 sizeof(*tag_slot)))
2d091d62 9364 goto fail;
d878c816 9365 }
1ad555c6 9366 }
b60c8dce 9367
3e942498 9368 atomic_set(&data->refs, 1);
1ad555c6 9369 init_completion(&data->done);
d878c816
PB
9370 *pdata = data;
9371 return 0;
2d091d62
PB
9372fail:
9373 io_rsrc_data_free(data);
9374 return ret;
1ad555c6
BM
9375}
9376
9123c8ff
PB
9377static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
9378{
0bea96f5
PB
9379 table->files = kvcalloc(nr_files, sizeof(table->files[0]),
9380 GFP_KERNEL_ACCOUNT);
9123c8ff
PB
9381 return !!table->files;
9382}
9383
042b0d85 9384static void io_free_file_tables(struct io_file_table *table)
9123c8ff 9385{
042b0d85 9386 kvfree(table->files);
9123c8ff
PB
9387 table->files = NULL;
9388}
9389
fff4db76 9390static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
1ad555c6 9391{
69cc1b6f 9392#if !defined(IO_URING_SCM_ALL)
1f59bc0f
PB
9393 int i;
9394
9395 for (i = 0; i < ctx->nr_user_files; i++) {
9396 struct file *file = io_file_from_index(ctx, i);
9397
5e45690a
JA
9398 if (!file)
9399 continue;
9400 if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
1f59bc0f 9401 continue;
1f59bc0f
PB
9402 fput(file);
9403 }
5e45690a 9404#endif
1f59bc0f 9405
fff4db76
PB
9406#if defined(CONFIG_UNIX)
9407 if (ctx->ring_sock) {
9408 struct sock *sock = ctx->ring_sock->sk;
9409 struct sk_buff *skb;
9410
9411 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
9412 kfree_skb(skb);
9413 }
fff4db76 9414#endif
042b0d85 9415 io_free_file_tables(&ctx->file_table);
44b31f2f 9416 io_rsrc_data_free(ctx->file_data);
fff4db76
PB
9417 ctx->file_data = NULL;
9418 ctx->nr_user_files = 0;
1ad555c6
BM
9419}
9420
d7954b2b
BM
9421static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
9422{
d7954b2b
BM
9423 int ret;
9424
08480400 9425 if (!ctx->file_data)
d7954b2b 9426 return -ENXIO;
08480400
PB
9427 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
9428 if (!ret)
9429 __io_sqe_files_unregister(ctx);
9430 return ret;
6b06314c
JA
9431}
9432
37d1e2e3 9433static void io_sq_thread_unpark(struct io_sq_data *sqd)
09a6f4ef 9434 __releases(&sqd->lock)
37d1e2e3 9435{
521d6a73
PB
9436 WARN_ON_ONCE(sqd->thread == current);
9437
9e138a48
PB
9438 /*
9439 * Do the dance but not conditional clear_bit() because it'd race with
9440 * other threads incrementing park_pending and setting the bit.
9441 */
37d1e2e3 9442 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
9e138a48
PB
9443 if (atomic_dec_return(&sqd->park_pending))
9444 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 9445 mutex_unlock(&sqd->lock);
37d1e2e3
JA
9446}
9447
86e0d676 9448static void io_sq_thread_park(struct io_sq_data *sqd)
09a6f4ef 9449 __acquires(&sqd->lock)
37d1e2e3 9450{
521d6a73
PB
9451 WARN_ON_ONCE(sqd->thread == current);
9452
9e138a48 9453 atomic_inc(&sqd->park_pending);
86e0d676 9454 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
09a6f4ef 9455 mutex_lock(&sqd->lock);
05962f95 9456 if (sqd->thread)
86e0d676 9457 wake_up_process(sqd->thread);
37d1e2e3
JA
9458}
9459
9460static void io_sq_thread_stop(struct io_sq_data *sqd)
9461{
521d6a73 9462 WARN_ON_ONCE(sqd->thread == current);
88885f66 9463 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
521d6a73 9464
05962f95 9465 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
88885f66 9466 mutex_lock(&sqd->lock);
e8f98f24
JA
9467 if (sqd->thread)
9468 wake_up_process(sqd->thread);
09a6f4ef 9469 mutex_unlock(&sqd->lock);
05962f95 9470 wait_for_completion(&sqd->exited);
37d1e2e3
JA
9471}
9472
534ca6d6 9473static void io_put_sq_data(struct io_sq_data *sqd)
6c271ce2 9474{
534ca6d6 9475 if (refcount_dec_and_test(&sqd->refs)) {
9e138a48
PB
9476 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
9477
37d1e2e3
JA
9478 io_sq_thread_stop(sqd);
9479 kfree(sqd);
9480 }
9481}
9482
9483static void io_sq_thread_finish(struct io_ring_ctx *ctx)
9484{
9485 struct io_sq_data *sqd = ctx->sq_data;
9486
9487 if (sqd) {
05962f95 9488 io_sq_thread_park(sqd);
521d6a73 9489 list_del_init(&ctx->sqd_list);
37d1e2e3 9490 io_sqd_update_thread_idle(sqd);
05962f95 9491 io_sq_thread_unpark(sqd);
37d1e2e3
JA
9492
9493 io_put_sq_data(sqd);
9494 ctx->sq_data = NULL;
534ca6d6
JA
9495 }
9496}
9497
aa06165d
JA
9498static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
9499{
9500 struct io_ring_ctx *ctx_attach;
9501 struct io_sq_data *sqd;
9502 struct fd f;
9503
9504 f = fdget(p->wq_fd);
9505 if (!f.file)
9506 return ERR_PTR(-ENXIO);
9507 if (f.file->f_op != &io_uring_fops) {
9508 fdput(f);
9509 return ERR_PTR(-EINVAL);
9510 }
9511
9512 ctx_attach = f.file->private_data;
9513 sqd = ctx_attach->sq_data;
9514 if (!sqd) {
9515 fdput(f);
9516 return ERR_PTR(-EINVAL);
9517 }
5c2469e0
JA
9518 if (sqd->task_tgid != current->tgid) {
9519 fdput(f);
9520 return ERR_PTR(-EPERM);
9521 }
aa06165d
JA
9522
9523 refcount_inc(&sqd->refs);
9524 fdput(f);
9525 return sqd;
9526}
9527
26984fbf
PB
9528static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
9529 bool *attached)
534ca6d6
JA
9530{
9531 struct io_sq_data *sqd;
9532
26984fbf 9533 *attached = false;
5c2469e0
JA
9534 if (p->flags & IORING_SETUP_ATTACH_WQ) {
9535 sqd = io_attach_sq_data(p);
26984fbf
PB
9536 if (!IS_ERR(sqd)) {
9537 *attached = true;
5c2469e0 9538 return sqd;
26984fbf 9539 }
5c2469e0
JA
9540 /* fall through for EPERM case, setup new sqd/task */
9541 if (PTR_ERR(sqd) != -EPERM)
9542 return sqd;
9543 }
aa06165d 9544
534ca6d6
JA
9545 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
9546 if (!sqd)
9547 return ERR_PTR(-ENOMEM);
9548
9e138a48 9549 atomic_set(&sqd->park_pending, 0);
534ca6d6 9550 refcount_set(&sqd->refs, 1);
69fb2131 9551 INIT_LIST_HEAD(&sqd->ctx_list);
09a6f4ef 9552 mutex_init(&sqd->lock);
534ca6d6 9553 init_waitqueue_head(&sqd->wait);
37d1e2e3 9554 init_completion(&sqd->exited);
534ca6d6
JA
9555 return sqd;
9556}
9557
6b06314c
JA
9558/*
9559 * Ensure the UNIX gc is aware of our file set, so we are certain that
9560 * the io_uring can be safely unregistered on process exit, even if we have
1f59bc0f
PB
9561 * loops in the file referencing. We account only files that can hold other
9562 * files because otherwise they can't form a loop and so are not interesting
9563 * for GC.
6b06314c 9564 */
8b3171bd 9565static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
6b06314c 9566{
73b25d3b 9567#if defined(CONFIG_UNIX)
6b06314c 9568 struct sock *sk = ctx->ring_sock->sk;
73b25d3b 9569 struct sk_buff_head *head = &sk->sk_receive_queue;
6b06314c
JA
9570 struct scm_fp_list *fpl;
9571 struct sk_buff *skb;
6b06314c 9572
73b25d3b
PB
9573 if (likely(!io_file_need_scm(file)))
9574 return 0;
6b06314c 9575
73b25d3b
PB
9576 /*
9577 * See if we can merge this file into an existing skb SCM_RIGHTS
9578 * file set. If there's no room, fall back to allocating a new skb
9579 * and filling it in.
9580 */
9581 spin_lock_irq(&head->lock);
9582 skb = skb_peek(head);
9583 if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
9584 __skb_unlink(skb, head);
9585 else
9586 skb = NULL;
9587 spin_unlock_irq(&head->lock);
6b06314c 9588
6b06314c 9589 if (!skb) {
73b25d3b
PB
9590 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
9591 if (!fpl)
9592 return -ENOMEM;
65e19f54 9593
73b25d3b
PB
9594 skb = alloc_skb(0, GFP_KERNEL);
9595 if (!skb) {
9596 kfree(fpl);
9597 return -ENOMEM;
9598 }
6b06314c 9599
73b25d3b 9600 fpl->user = get_uid(current_user());
08a45173 9601 fpl->max = SCM_MAX_FD;
73b25d3b 9602 fpl->count = 0;
dca58c6a 9603
08a45173 9604 UNIXCB(skb).fp = fpl;
73b25d3b 9605 skb->sk = sk;
05f3fb3c 9606 skb->destructor = unix_destruct_scm;
08a45173 9607 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6b06314c
JA
9608 }
9609
73b25d3b
PB
9610 fpl = UNIXCB(skb).fp;
9611 fpl->fp[fpl->count++] = get_file(file);
9612 unix_inflight(fpl->user, file);
9613 skb_queue_head(head, skb);
dca58c6a 9614 fput(file);
73b25d3b 9615#endif
6b06314c
JA
9616 return 0;
9617}
6b06314c 9618
47e90392 9619static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
05f3fb3c 9620{
50238531 9621 struct file *file = prsrc->file;
05f3fb3c
JA
9622#if defined(CONFIG_UNIX)
9623 struct sock *sock = ctx->ring_sock->sk;
9624 struct sk_buff_head list, *head = &sock->sk_receive_queue;
9625 struct sk_buff *skb;
9626 int i;
9627
1f59bc0f
PB
9628 if (!io_file_need_scm(file)) {
9629 fput(file);
9630 return;
9631 }
9632
05f3fb3c
JA
9633 __skb_queue_head_init(&list);
9634
9635 /*
9636 * Find the skb that holds this file in its SCM_RIGHTS. When found,
9637 * remove this entry and rearrange the file array.
9638 */
9639 skb = skb_dequeue(head);
9640 while (skb) {
9641 struct scm_fp_list *fp;
9642
9643 fp = UNIXCB(skb).fp;
9644 for (i = 0; i < fp->count; i++) {
9645 int left;
9646
9647 if (fp->fp[i] != file)
9648 continue;
9649
9650 unix_notinflight(fp->user, fp->fp[i]);
9651 left = fp->count - 1 - i;
9652 if (left) {
9653 memmove(&fp->fp[i], &fp->fp[i + 1],
9654 left * sizeof(struct file *));
9655 }
9656 fp->count--;
9657 if (!fp->count) {
9658 kfree_skb(skb);
9659 skb = NULL;
9660 } else {
9661 __skb_queue_tail(&list, skb);
9662 }
9663 fput(file);
9664 file = NULL;
9665 break;
9666 }
9667
9668 if (!file)
9669 break;
9670
9671 __skb_queue_tail(&list, skb);
9672
9673 skb = skb_dequeue(head);
9674 }
9675
9676 if (skb_peek(&list)) {
9677 spin_lock_irq(&head->lock);
9678 while ((skb = __skb_dequeue(&list)) != NULL)
9679 __skb_queue_tail(head, skb);
9680 spin_unlock_irq(&head->lock);
9681 }
9682#else
9683 fput(file);
9684#endif
9685}
9686
b895c9a6 9687static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
65e19f54 9688{
b895c9a6 9689 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
269bbe5f
BM
9690 struct io_ring_ctx *ctx = rsrc_data->ctx;
9691 struct io_rsrc_put *prsrc, *tmp;
05589553 9692
269bbe5f
BM
9693 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
9694 list_del(&prsrc->list);
b60c8dce
PB
9695
9696 if (prsrc->tag) {
f8929630
PB
9697 if (ctx->flags & IORING_SETUP_IOPOLL)
9698 mutex_lock(&ctx->uring_lock);
b60c8dce 9699
79ebeaee 9700 spin_lock(&ctx->completion_lock);
913a571a 9701 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
b60c8dce 9702 io_commit_cqring(ctx);
79ebeaee 9703 spin_unlock(&ctx->completion_lock);
b60c8dce 9704 io_cqring_ev_posted(ctx);
f8929630
PB
9705
9706 if (ctx->flags & IORING_SETUP_IOPOLL)
9707 mutex_unlock(&ctx->uring_lock);
b60c8dce
PB
9708 }
9709
40ae0ff7 9710 rsrc_data->do_put(ctx, prsrc);
269bbe5f 9711 kfree(prsrc);
65e19f54 9712 }
05589553 9713
28a9fe25 9714 io_rsrc_node_destroy(ref_node);
3e942498
PB
9715 if (atomic_dec_and_test(&rsrc_data->refs))
9716 complete(&rsrc_data->done);
2faf852d 9717}
65e19f54 9718
269bbe5f 9719static void io_rsrc_put_work(struct work_struct *work)
4a38aed2
JA
9720{
9721 struct io_ring_ctx *ctx;
9722 struct llist_node *node;
9723
269bbe5f
BM
9724 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
9725 node = llist_del_all(&ctx->rsrc_put_llist);
4a38aed2
JA
9726
9727 while (node) {
b895c9a6 9728 struct io_rsrc_node *ref_node;
4a38aed2
JA
9729 struct llist_node *next = node->next;
9730
b895c9a6 9731 ref_node = llist_entry(node, struct io_rsrc_node, llist);
269bbe5f 9732 __io_rsrc_put_work(ref_node);
4a38aed2
JA
9733 node = next;
9734 }
9735}
9736
6b06314c 9737static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
792e3582 9738 unsigned nr_args, u64 __user *tags)
6b06314c
JA
9739{
9740 __s32 __user *fds = (__s32 __user *) arg;
05f3fb3c 9741 struct file *file;
f3baed39 9742 int fd, ret;
846a4ef2 9743 unsigned i;
6b06314c 9744
05f3fb3c 9745 if (ctx->file_data)
6b06314c
JA
9746 return -EBUSY;
9747 if (!nr_args)
9748 return -EINVAL;
9749 if (nr_args > IORING_MAX_FIXED_FILES)
9750 return -EMFILE;
3a1b8a4e
PB
9751 if (nr_args > rlimit(RLIMIT_NOFILE))
9752 return -EMFILE;
a7f0ed5a 9753 ret = io_rsrc_node_switch_start(ctx);
f3baed39
PB
9754 if (ret)
9755 return ret;
d878c816
PB
9756 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
9757 &ctx->file_data);
9758 if (ret)
9759 return ret;
6b06314c 9760
a03a2a20
PB
9761 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
9762 io_rsrc_data_free(ctx->file_data);
9763 ctx->file_data = NULL;
9764 return -ENOMEM;
9765 }
65e19f54 9766
08a45173 9767 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
a03a2a20
PB
9768 struct io_fixed_file *file_slot;
9769
d878c816 9770 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
600cf3f8 9771 ret = -EFAULT;
a03a2a20 9772 goto fail;
600cf3f8 9773 }
08a45173 9774 /* allow sparse sets */
792e3582
PB
9775 if (fd == -1) {
9776 ret = -EINVAL;
2d091d62 9777 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
a03a2a20 9778 goto fail;
08a45173 9779 continue;
792e3582 9780 }
6b06314c 9781
05f3fb3c 9782 file = fget(fd);
6b06314c 9783 ret = -EBADF;
792e3582 9784 if (unlikely(!file))
a03a2a20 9785 goto fail;
05f3fb3c 9786
6b06314c
JA
9787 /*
9788 * Don't allow io_uring instances to be registered. If UNIX
9789 * isn't enabled, then this causes a reference cycle and this
9790 * instance can never get freed. If UNIX is enabled we'll
9791 * handle it just fine, but there's still no point in allowing
9792 * a ring fd as it doesn't support regular read/write anyway.
9793 */
05f3fb3c
JA
9794 if (file->f_op == &io_uring_fops) {
9795 fput(file);
a03a2a20 9796 goto fail;
6b06314c 9797 }
8b3171bd 9798 ret = io_scm_file_account(ctx, file);
a03a2a20 9799 if (ret) {
600cf3f8 9800 fput(file);
a03a2a20 9801 goto fail;
c3a31e60 9802 }
e390510a
PB
9803 file_slot = io_fixed_file_slot(&ctx->file_table, i);
9804 io_fixed_file_set(file_slot, file);
c3a31e60
JA
9805 }
9806
a7f0ed5a 9807 io_rsrc_node_switch(ctx, NULL);
c3a31e60 9808 return 0;
a03a2a20
PB
9809fail:
9810 __io_sqe_files_unregister(ctx);
6b06314c 9811 return ret;
c3a31e60
JA
9812}
9813
9c7b0ba8
PB
9814static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
9815 struct io_rsrc_node *node, void *rsrc)
9816{
8f0a2480 9817 u64 *tag_slot = io_get_tag_slot(data, idx);
9c7b0ba8
PB
9818 struct io_rsrc_put *prsrc;
9819
9820 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
9821 if (!prsrc)
9822 return -ENOMEM;
9823
8f0a2480
PB
9824 prsrc->tag = *tag_slot;
9825 *tag_slot = 0;
9c7b0ba8
PB
9826 prsrc->rsrc = rsrc;
9827 list_add(&prsrc->list, &node->rsrc_list);
9828 return 0;
9829}
9830
b9445598
PB
9831static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
9832 unsigned int issue_flags, u32 slot_index)
9833{
9834 struct io_ring_ctx *ctx = req->ctx;
9c7b0ba8 9835 bool needs_switch = false;
b9445598
PB
9836 struct io_fixed_file *file_slot;
9837 int ret = -EBADF;
9838
f8929630 9839 io_ring_submit_lock(ctx, issue_flags);
b9445598
PB
9840 if (file->f_op == &io_uring_fops)
9841 goto err;
9842 ret = -ENXIO;
9843 if (!ctx->file_data)
9844 goto err;
9845 ret = -EINVAL;
9846 if (slot_index >= ctx->nr_user_files)
9847 goto err;
9848
9849 slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
9850 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
9c7b0ba8
PB
9851
9852 if (file_slot->file_ptr) {
9853 struct file *old_file;
9854
9855 ret = io_rsrc_node_switch_start(ctx);
9856 if (ret)
9857 goto err;
9858
9859 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
9860 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
9861 ctx->rsrc_node, old_file);
9862 if (ret)
9863 goto err;
9864 file_slot->file_ptr = 0;
9865 needs_switch = true;
9866 }
b9445598 9867
8b3171bd 9868 ret = io_scm_file_account(ctx, file);
e390510a
PB
9869 if (!ret) {
9870 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
9871 io_fixed_file_set(file_slot, file);
b9445598 9872 }
b9445598 9873err:
9c7b0ba8
PB
9874 if (needs_switch)
9875 io_rsrc_node_switch(ctx, ctx->file_data);
f8929630 9876 io_ring_submit_unlock(ctx, issue_flags);
b9445598
PB
9877 if (ret)
9878 fput(file);
9879 return ret;
9880}
9881
7df778be
PB
9882static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
9883{
9884 unsigned int offset = req->close.file_slot - 1;
9885 struct io_ring_ctx *ctx = req->ctx;
9886 struct io_fixed_file *file_slot;
9887 struct file *file;
4cdd158b 9888 int ret;
7df778be 9889
f8929630 9890 io_ring_submit_lock(ctx, issue_flags);
7df778be
PB
9891 ret = -ENXIO;
9892 if (unlikely(!ctx->file_data))
9893 goto out;
9894 ret = -EINVAL;
9895 if (offset >= ctx->nr_user_files)
9896 goto out;
9897 ret = io_rsrc_node_switch_start(ctx);
9898 if (ret)
9899 goto out;
9900
4cdd158b
PB
9901 offset = array_index_nospec(offset, ctx->nr_user_files);
9902 file_slot = io_fixed_file_slot(&ctx->file_table, offset);
7df778be
PB
9903 ret = -EBADF;
9904 if (!file_slot->file_ptr)
9905 goto out;
9906
9907 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
9908 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
9909 if (ret)
9910 goto out;
9911
9912 file_slot->file_ptr = 0;
9913 io_rsrc_node_switch(ctx, ctx->file_data);
9914 ret = 0;
9915out:
f8929630 9916 io_ring_submit_unlock(ctx, issue_flags);
7df778be
PB
9917 return ret;
9918}
9919
05f3fb3c 9920static int __io_sqe_files_update(struct io_ring_ctx *ctx,
c3bdad02 9921 struct io_uring_rsrc_update2 *up,
05f3fb3c
JA
9922 unsigned nr_args)
9923{
c3bdad02 9924 u64 __user *tags = u64_to_user_ptr(up->tags);
98f0b3b4 9925 __s32 __user *fds = u64_to_user_ptr(up->data);
b895c9a6 9926 struct io_rsrc_data *data = ctx->file_data;
a04b0ac0
PB
9927 struct io_fixed_file *file_slot;
9928 struct file *file;
98f0b3b4
PB
9929 int fd, i, err = 0;
9930 unsigned int done;
05589553 9931 bool needs_switch = false;
c3a31e60 9932
98f0b3b4
PB
9933 if (!ctx->file_data)
9934 return -ENXIO;
9935 if (up->offset + nr_args > ctx->nr_user_files)
c3a31e60
JA
9936 return -EINVAL;
9937
67973b93 9938 for (done = 0; done < nr_args; done++) {
c3bdad02
PB
9939 u64 tag = 0;
9940
9941 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
9942 copy_from_user(&fd, &fds[done], sizeof(fd))) {
c3a31e60
JA
9943 err = -EFAULT;
9944 break;
9945 }
c3bdad02
PB
9946 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
9947 err = -EINVAL;
9948 break;
9949 }
4e0377a1 9950 if (fd == IORING_REGISTER_FILES_SKIP)
9951 continue;
9952
67973b93 9953 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
aeca241b 9954 file_slot = io_fixed_file_slot(&ctx->file_table, i);
ea64ec02 9955
a04b0ac0
PB
9956 if (file_slot->file_ptr) {
9957 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
4cdd158b 9958 err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
a5318d3c
HD
9959 if (err)
9960 break;
a04b0ac0 9961 file_slot->file_ptr = 0;
05589553 9962 needs_switch = true;
c3a31e60
JA
9963 }
9964 if (fd != -1) {
c3a31e60
JA
9965 file = fget(fd);
9966 if (!file) {
9967 err = -EBADF;
9968 break;
9969 }
9970 /*
9971 * Don't allow io_uring instances to be registered. If
9972 * UNIX isn't enabled, then this causes a reference
9973 * cycle and this instance can never get freed. If UNIX
9974 * is enabled we'll handle it just fine, but there's
9975 * still no point in allowing a ring fd as it doesn't
9976 * support regular read/write anyway.
9977 */
9978 if (file->f_op == &io_uring_fops) {
9979 fput(file);
9980 err = -EBADF;
9981 break;
9982 }
8b3171bd 9983 err = io_scm_file_account(ctx, file);
f3bd9dae
YY
9984 if (err) {
9985 fput(file);
c3a31e60 9986 break;
f3bd9dae 9987 }
e390510a
PB
9988 *io_get_tag_slot(data, i) = tag;
9989 io_fixed_file_set(file_slot, file);
c3a31e60 9990 }
05f3fb3c
JA
9991 }
9992
a7f0ed5a
PB
9993 if (needs_switch)
9994 io_rsrc_node_switch(ctx, data);
c3a31e60
JA
9995 return done ? done : err;
9996}
05589553 9997
685fe7fe
JA
9998static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
9999 struct task_struct *task)
24369c2e 10000{
e941894e 10001 struct io_wq_hash *hash;
24369c2e 10002 struct io_wq_data data;
24369c2e 10003 unsigned int concurrency;
24369c2e 10004
362a9e65 10005 mutex_lock(&ctx->uring_lock);
e941894e
JA
10006 hash = ctx->hash_map;
10007 if (!hash) {
10008 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
362a9e65
YY
10009 if (!hash) {
10010 mutex_unlock(&ctx->uring_lock);
e941894e 10011 return ERR_PTR(-ENOMEM);
362a9e65 10012 }
e941894e
JA
10013 refcount_set(&hash->refs, 1);
10014 init_waitqueue_head(&hash->wait);
10015 ctx->hash_map = hash;
24369c2e 10016 }
362a9e65 10017 mutex_unlock(&ctx->uring_lock);
24369c2e 10018
e941894e 10019 data.hash = hash;
685fe7fe 10020 data.task = task;
ebc11b6c 10021 data.free_work = io_wq_free_work;
f5fa38c5 10022 data.do_work = io_wq_submit_work;
24369c2e 10023
d25e3a3d
JA
10024 /* Do QD, or 4 * CPUS, whatever is smallest */
10025 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
24369c2e 10026
5aa75ed5 10027 return io_wq_create(concurrency, &data);
24369c2e
PB
10028}
10029
c072481d
PB
10030static __cold int io_uring_alloc_task_context(struct task_struct *task,
10031 struct io_ring_ctx *ctx)
0f212204
JA
10032{
10033 struct io_uring_task *tctx;
d8a6df10 10034 int ret;
0f212204 10035
09899b19 10036 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
0f212204
JA
10037 if (unlikely(!tctx))
10038 return -ENOMEM;
10039
e7a6c00d
JA
10040 tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
10041 sizeof(struct file *), GFP_KERNEL);
10042 if (unlikely(!tctx->registered_rings)) {
10043 kfree(tctx);
10044 return -ENOMEM;
10045 }
10046
d8a6df10
JA
10047 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
10048 if (unlikely(ret)) {
e7a6c00d 10049 kfree(tctx->registered_rings);
d8a6df10
JA
10050 kfree(tctx);
10051 return ret;
10052 }
10053
685fe7fe 10054 tctx->io_wq = io_init_wq_offload(ctx, task);
5aa75ed5
JA
10055 if (IS_ERR(tctx->io_wq)) {
10056 ret = PTR_ERR(tctx->io_wq);
10057 percpu_counter_destroy(&tctx->inflight);
e7a6c00d 10058 kfree(tctx->registered_rings);
5aa75ed5
JA
10059 kfree(tctx);
10060 return ret;
10061 }
10062
0f212204
JA
10063 xa_init(&tctx->xa);
10064 init_waitqueue_head(&tctx->wait);
fdaf083c 10065 atomic_set(&tctx->in_idle, 0);
0f212204 10066 task->io_uring = tctx;
7cbf1722
JA
10067 spin_lock_init(&tctx->task_lock);
10068 INIT_WQ_LIST(&tctx->task_list);
4813c377 10069 INIT_WQ_LIST(&tctx->prior_task_list);
7cbf1722 10070 init_task_work(&tctx->task_work, tctx_task_work);
0f212204
JA
10071 return 0;
10072}
10073
10074void __io_uring_free(struct task_struct *tsk)
10075{
10076 struct io_uring_task *tctx = tsk->io_uring;
10077
10078 WARN_ON_ONCE(!xa_empty(&tctx->xa));
ef8eaa4e 10079 WARN_ON_ONCE(tctx->io_wq);
09899b19 10080 WARN_ON_ONCE(tctx->cached_refs);
ef8eaa4e 10081
e7a6c00d 10082 kfree(tctx->registered_rings);
d8a6df10 10083 percpu_counter_destroy(&tctx->inflight);
0f212204
JA
10084 kfree(tctx);
10085 tsk->io_uring = NULL;
10086}
10087
c072481d
PB
10088static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
10089 struct io_uring_params *p)
2b188cc1
JA
10090{
10091 int ret;
10092
d25e3a3d
JA
10093 /* Retain compatibility with failing for an invalid attach attempt */
10094 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
10095 IORING_SETUP_ATTACH_WQ) {
10096 struct fd f;
10097
10098 f = fdget(p->wq_fd);
10099 if (!f.file)
10100 return -ENXIO;
0cc936f7
JA
10101 if (f.file->f_op != &io_uring_fops) {
10102 fdput(f);
f2a48dd0 10103 return -EINVAL;
0cc936f7
JA
10104 }
10105 fdput(f);
d25e3a3d 10106 }
6c271ce2 10107 if (ctx->flags & IORING_SETUP_SQPOLL) {
46fe18b1 10108 struct task_struct *tsk;
534ca6d6 10109 struct io_sq_data *sqd;
26984fbf 10110 bool attached;
534ca6d6 10111
cdc1404a
PM
10112 ret = security_uring_sqpoll();
10113 if (ret)
10114 return ret;
10115
26984fbf 10116 sqd = io_get_sq_data(p, &attached);
534ca6d6
JA
10117 if (IS_ERR(sqd)) {
10118 ret = PTR_ERR(sqd);
10119 goto err;
10120 }
69fb2131 10121
7c30f36a 10122 ctx->sq_creds = get_current_cred();
534ca6d6 10123 ctx->sq_data = sqd;
917257da
JA
10124 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
10125 if (!ctx->sq_thread_idle)
10126 ctx->sq_thread_idle = HZ;
10127
78d7f6ba 10128 io_sq_thread_park(sqd);
de75a3d3
PB
10129 list_add(&ctx->sqd_list, &sqd->ctx_list);
10130 io_sqd_update_thread_idle(sqd);
26984fbf 10131 /* don't attach to a dying SQPOLL thread, would be racy */
f2a48dd0 10132 ret = (attached && !sqd->thread) ? -ENXIO : 0;
78d7f6ba
PB
10133 io_sq_thread_unpark(sqd);
10134
de75a3d3
PB
10135 if (ret < 0)
10136 goto err;
10137 if (attached)
5aa75ed5 10138 return 0;
aa06165d 10139
6c271ce2 10140 if (p->flags & IORING_SETUP_SQ_AFF) {
44a9bd18 10141 int cpu = p->sq_thread_cpu;
6c271ce2 10142
917257da 10143 ret = -EINVAL;
f2a48dd0 10144 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
e8f98f24 10145 goto err_sqpoll;
37d1e2e3 10146 sqd->sq_cpu = cpu;
6c271ce2 10147 } else {
37d1e2e3 10148 sqd->sq_cpu = -1;
6c271ce2 10149 }
37d1e2e3
JA
10150
10151 sqd->task_pid = current->pid;
5c2469e0 10152 sqd->task_tgid = current->tgid;
46fe18b1
JA
10153 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
10154 if (IS_ERR(tsk)) {
10155 ret = PTR_ERR(tsk);
e8f98f24 10156 goto err_sqpoll;
6c271ce2 10157 }
97a73a0f 10158
46fe18b1 10159 sqd->thread = tsk;
97a73a0f 10160 ret = io_uring_alloc_task_context(tsk, ctx);
46fe18b1 10161 wake_up_new_task(tsk);
0f212204
JA
10162 if (ret)
10163 goto err;
6c271ce2
JA
10164 } else if (p->flags & IORING_SETUP_SQ_AFF) {
10165 /* Can't have SQ_AFF without SQPOLL */
10166 ret = -EINVAL;
10167 goto err;
10168 }
10169
2b188cc1 10170 return 0;
f2a48dd0
PB
10171err_sqpoll:
10172 complete(&ctx->sq_data->exited);
2b188cc1 10173err:
37d1e2e3 10174 io_sq_thread_finish(ctx);
2b188cc1
JA
10175 return ret;
10176}
10177
a087e2b5
BM
10178static inline void __io_unaccount_mem(struct user_struct *user,
10179 unsigned long nr_pages)
2b188cc1
JA
10180{
10181 atomic_long_sub(nr_pages, &user->locked_vm);
10182}
10183
a087e2b5
BM
10184static inline int __io_account_mem(struct user_struct *user,
10185 unsigned long nr_pages)
2b188cc1
JA
10186{
10187 unsigned long page_limit, cur_pages, new_pages;
10188
10189 /* Don't allow more pages than we can safely lock */
10190 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
10191
10192 do {
10193 cur_pages = atomic_long_read(&user->locked_vm);
10194 new_pages = cur_pages + nr_pages;
10195 if (new_pages > page_limit)
10196 return -ENOMEM;
10197 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
10198 new_pages) != cur_pages);
10199
10200 return 0;
10201}
10202
26bfa89e 10203static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 10204{
62e398be 10205 if (ctx->user)
a087e2b5 10206 __io_unaccount_mem(ctx->user, nr_pages);
30975825 10207
26bfa89e
JA
10208 if (ctx->mm_account)
10209 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
10210}
10211
26bfa89e 10212static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
a087e2b5 10213{
30975825
BM
10214 int ret;
10215
62e398be 10216 if (ctx->user) {
30975825
BM
10217 ret = __io_account_mem(ctx->user, nr_pages);
10218 if (ret)
10219 return ret;
10220 }
10221
26bfa89e
JA
10222 if (ctx->mm_account)
10223 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
a087e2b5
BM
10224
10225 return 0;
10226}
10227
2b188cc1
JA
10228static void io_mem_free(void *ptr)
10229{
52e04ef4
MR
10230 struct page *page;
10231
10232 if (!ptr)
10233 return;
2b188cc1 10234
52e04ef4 10235 page = virt_to_head_page(ptr);
2b188cc1
JA
10236 if (put_page_testzero(page))
10237 free_compound_page(page);
10238}
10239
10240static void *io_mem_alloc(size_t size)
10241{
0a3f1e0b 10242 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2b188cc1 10243
0a3f1e0b 10244 return (void *) __get_free_pages(gfp, get_order(size));
2b188cc1
JA
10245}
10246
baf9cb64
SR
10247static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
10248 unsigned int cq_entries, size_t *sq_offset)
75b28aff
HV
10249{
10250 struct io_rings *rings;
10251 size_t off, sq_array_size;
10252
10253 off = struct_size(rings, cqes, cq_entries);
10254 if (off == SIZE_MAX)
10255 return SIZE_MAX;
baf9cb64
SR
10256 if (ctx->flags & IORING_SETUP_CQE32) {
10257 if (check_shl_overflow(off, 1, &off))
10258 return SIZE_MAX;
10259 }
75b28aff
HV
10260
10261#ifdef CONFIG_SMP
10262 off = ALIGN(off, SMP_CACHE_BYTES);
10263 if (off == 0)
10264 return SIZE_MAX;
10265#endif
10266
b36200f5
DV
10267 if (sq_offset)
10268 *sq_offset = off;
10269
75b28aff
HV
10270 sq_array_size = array_size(sizeof(u32), sq_entries);
10271 if (sq_array_size == SIZE_MAX)
10272 return SIZE_MAX;
10273
10274 if (check_add_overflow(off, sq_array_size, &off))
10275 return SIZE_MAX;
10276
75b28aff
HV
10277 return off;
10278}
10279
41edf1a5 10280static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
7f61a1e9 10281{
41edf1a5 10282 struct io_mapped_ubuf *imu = *slot;
7f61a1e9
PB
10283 unsigned int i;
10284
6224843d
PB
10285 if (imu != ctx->dummy_ubuf) {
10286 for (i = 0; i < imu->nr_bvecs; i++)
10287 unpin_user_page(imu->bvec[i].bv_page);
10288 if (imu->acct_pages)
10289 io_unaccount_mem(ctx, imu->acct_pages);
10290 kvfree(imu);
10291 }
41edf1a5 10292 *slot = NULL;
7f61a1e9
PB
10293}
10294
bd54b6fe 10295static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
edafccee 10296{
634d00df
PB
10297 io_buffer_unmap(ctx, &prsrc->buf);
10298 prsrc->buf = NULL;
bd54b6fe 10299}
edafccee 10300
bd54b6fe
BM
10301static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
10302{
10303 unsigned int i;
edafccee 10304
7f61a1e9
PB
10305 for (i = 0; i < ctx->nr_user_bufs; i++)
10306 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
edafccee 10307 kfree(ctx->user_bufs);
bb6659cc 10308 io_rsrc_data_free(ctx->buf_data);
edafccee 10309 ctx->user_bufs = NULL;
bd54b6fe 10310 ctx->buf_data = NULL;
edafccee 10311 ctx->nr_user_bufs = 0;
bd54b6fe
BM
10312}
10313
0a96bbe4 10314static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
edafccee 10315{
bd54b6fe 10316 int ret;
edafccee 10317
bd54b6fe 10318 if (!ctx->buf_data)
edafccee
JA
10319 return -ENXIO;
10320
bd54b6fe
BM
10321 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
10322 if (!ret)
10323 __io_sqe_buffers_unregister(ctx);
10324 return ret;
edafccee
JA
10325}
10326
10327static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
10328 void __user *arg, unsigned index)
10329{
10330 struct iovec __user *src;
10331
10332#ifdef CONFIG_COMPAT
10333 if (ctx->compat) {
10334 struct compat_iovec __user *ciovs;
10335 struct compat_iovec ciov;
10336
10337 ciovs = (struct compat_iovec __user *) arg;
10338 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
10339 return -EFAULT;
10340
d55e5f5b 10341 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
edafccee
JA
10342 dst->iov_len = ciov.iov_len;
10343 return 0;
10344 }
10345#endif
10346 src = (struct iovec __user *) arg;
10347 if (copy_from_user(dst, &src[index], sizeof(*dst)))
10348 return -EFAULT;
10349 return 0;
10350}
10351
de293938
JA
10352/*
10353 * Not super efficient, but this is just a registration time. And we do cache
10354 * the last compound head, so generally we'll only do a full search if we don't
10355 * match that one.
10356 *
10357 * We check if the given compound head page has already been accounted, to
10358 * avoid double accounting it. This allows us to account the full size of the
10359 * page, not just the constituent pages of a huge page.
10360 */
10361static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
10362 int nr_pages, struct page *hpage)
10363{
10364 int i, j;
10365
10366 /* check current page array */
10367 for (i = 0; i < nr_pages; i++) {
10368 if (!PageCompound(pages[i]))
10369 continue;
10370 if (compound_head(pages[i]) == hpage)
10371 return true;
10372 }
10373
10374 /* check previously registered pages */
10375 for (i = 0; i < ctx->nr_user_bufs; i++) {
41edf1a5 10376 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
de293938
JA
10377
10378 for (j = 0; j < imu->nr_bvecs; j++) {
10379 if (!PageCompound(imu->bvec[j].bv_page))
10380 continue;
10381 if (compound_head(imu->bvec[j].bv_page) == hpage)
10382 return true;
10383 }
10384 }
10385
10386 return false;
10387}
10388
10389static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
10390 int nr_pages, struct io_mapped_ubuf *imu,
10391 struct page **last_hpage)
10392{
10393 int i, ret;
10394
216e5835 10395 imu->acct_pages = 0;
de293938
JA
10396 for (i = 0; i < nr_pages; i++) {
10397 if (!PageCompound(pages[i])) {
10398 imu->acct_pages++;
10399 } else {
10400 struct page *hpage;
10401
10402 hpage = compound_head(pages[i]);
10403 if (hpage == *last_hpage)
10404 continue;
10405 *last_hpage = hpage;
10406 if (headpage_already_acct(ctx, pages, i, hpage))
10407 continue;
10408 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
10409 }
10410 }
10411
10412 if (!imu->acct_pages)
10413 return 0;
10414
26bfa89e 10415 ret = io_account_mem(ctx, imu->acct_pages);
de293938
JA
10416 if (ret)
10417 imu->acct_pages = 0;
10418 return ret;
10419}
10420
0a96bbe4 10421static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
41edf1a5 10422 struct io_mapped_ubuf **pimu,
0a96bbe4 10423 struct page **last_hpage)
edafccee 10424{
41edf1a5 10425 struct io_mapped_ubuf *imu = NULL;
edafccee
JA
10426 struct vm_area_struct **vmas = NULL;
10427 struct page **pages = NULL;
0a96bbe4
BM
10428 unsigned long off, start, end, ubuf;
10429 size_t size;
10430 int ret, pret, nr_pages, i;
10431
6224843d
PB
10432 if (!iov->iov_base) {
10433 *pimu = ctx->dummy_ubuf;
10434 return 0;
10435 }
10436
0a96bbe4
BM
10437 ubuf = (unsigned long) iov->iov_base;
10438 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
10439 start = ubuf >> PAGE_SHIFT;
10440 nr_pages = end - start;
10441
41edf1a5 10442 *pimu = NULL;
0a96bbe4
BM
10443 ret = -ENOMEM;
10444
10445 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
10446 if (!pages)
10447 goto done;
10448
10449 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
10450 GFP_KERNEL);
10451 if (!vmas)
10452 goto done;
edafccee 10453
41edf1a5 10454 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
a2b4198c 10455 if (!imu)
0a96bbe4
BM
10456 goto done;
10457
10458 ret = 0;
10459 mmap_read_lock(current->mm);
10460 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
10461 pages, vmas);
10462 if (pret == nr_pages) {
10463 /* don't support file backed memory */
10464 for (i = 0; i < nr_pages; i++) {
10465 struct vm_area_struct *vma = vmas[i];
10466
40dad765
PB
10467 if (vma_is_shmem(vma))
10468 continue;
0a96bbe4
BM
10469 if (vma->vm_file &&
10470 !is_file_hugepages(vma->vm_file)) {
10471 ret = -EOPNOTSUPP;
10472 break;
10473 }
10474 }
10475 } else {
10476 ret = pret < 0 ? pret : -EFAULT;
10477 }
10478 mmap_read_unlock(current->mm);
10479 if (ret) {
10480 /*
10481 * if we did partial map, or found file backed vmas,
10482 * release any pages we did get
10483 */
10484 if (pret > 0)
10485 unpin_user_pages(pages, pret);
0a96bbe4
BM
10486 goto done;
10487 }
10488
10489 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
10490 if (ret) {
10491 unpin_user_pages(pages, pret);
0a96bbe4
BM
10492 goto done;
10493 }
10494
10495 off = ubuf & ~PAGE_MASK;
10496 size = iov->iov_len;
10497 for (i = 0; i < nr_pages; i++) {
10498 size_t vec_len;
10499
10500 vec_len = min_t(size_t, size, PAGE_SIZE - off);
10501 imu->bvec[i].bv_page = pages[i];
10502 imu->bvec[i].bv_len = vec_len;
10503 imu->bvec[i].bv_offset = off;
10504 off = 0;
10505 size -= vec_len;
10506 }
10507 /* store original address for later verification */
10508 imu->ubuf = ubuf;
4751f53d 10509 imu->ubuf_end = ubuf + iov->iov_len;
0a96bbe4 10510 imu->nr_bvecs = nr_pages;
41edf1a5 10511 *pimu = imu;
0a96bbe4
BM
10512 ret = 0;
10513done:
41edf1a5
PB
10514 if (ret)
10515 kvfree(imu);
0a96bbe4
BM
10516 kvfree(pages);
10517 kvfree(vmas);
10518 return ret;
10519}
10520
2b358604 10521static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
0a96bbe4 10522{
87094465
PB
10523 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
10524 return ctx->user_bufs ? 0 : -ENOMEM;
2b358604 10525}
edafccee 10526
2b358604
BM
10527static int io_buffer_validate(struct iovec *iov)
10528{
50e96989
PB
10529 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
10530
2b358604
BM
10531 /*
10532 * Don't impose further limits on the size and buffer
10533 * constraints here, we'll -EINVAL later when IO is
10534 * submitted if they are wrong.
10535 */
6224843d
PB
10536 if (!iov->iov_base)
10537 return iov->iov_len ? -EFAULT : 0;
10538 if (!iov->iov_len)
2b358604 10539 return -EFAULT;
edafccee 10540
2b358604
BM
10541 /* arbitrary limit, but we need something */
10542 if (iov->iov_len > SZ_1G)
10543 return -EFAULT;
edafccee 10544
50e96989
PB
10545 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
10546 return -EOVERFLOW;
10547
2b358604
BM
10548 return 0;
10549}
edafccee 10550
2b358604 10551static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
634d00df 10552 unsigned int nr_args, u64 __user *tags)
2b358604 10553{
bd54b6fe
BM
10554 struct page *last_hpage = NULL;
10555 struct io_rsrc_data *data;
2b358604
BM
10556 int i, ret;
10557 struct iovec iov;
edafccee 10558
87094465
PB
10559 if (ctx->user_bufs)
10560 return -EBUSY;
489809e2 10561 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
87094465 10562 return -EINVAL;
bd54b6fe 10563 ret = io_rsrc_node_switch_start(ctx);
2b358604
BM
10564 if (ret)
10565 return ret;
d878c816
PB
10566 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
10567 if (ret)
10568 return ret;
bd54b6fe
BM
10569 ret = io_buffers_map_alloc(ctx, nr_args);
10570 if (ret) {
bb6659cc 10571 io_rsrc_data_free(data);
bd54b6fe
BM
10572 return ret;
10573 }
edafccee 10574
87094465 10575 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
edafccee
JA
10576 ret = io_copy_iov(ctx, &iov, arg, i);
10577 if (ret)
0a96bbe4 10578 break;
2b358604
BM
10579 ret = io_buffer_validate(&iov);
10580 if (ret)
0a96bbe4 10581 break;
2d091d62 10582 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
cf3770e7
CIK
10583 ret = -EINVAL;
10584 break;
10585 }
edafccee 10586
41edf1a5
PB
10587 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
10588 &last_hpage);
0a96bbe4
BM
10589 if (ret)
10590 break;
edafccee 10591 }
0a96bbe4 10592
bd54b6fe 10593 WARN_ON_ONCE(ctx->buf_data);
0a96bbe4 10594
bd54b6fe
BM
10595 ctx->buf_data = data;
10596 if (ret)
10597 __io_sqe_buffers_unregister(ctx);
10598 else
10599 io_rsrc_node_switch(ctx, NULL);
edafccee
JA
10600 return ret;
10601}
10602
634d00df
PB
10603static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
10604 struct io_uring_rsrc_update2 *up,
10605 unsigned int nr_args)
10606{
10607 u64 __user *tags = u64_to_user_ptr(up->tags);
10608 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
634d00df
PB
10609 struct page *last_hpage = NULL;
10610 bool needs_switch = false;
10611 __u32 done;
10612 int i, err;
10613
10614 if (!ctx->buf_data)
10615 return -ENXIO;
10616 if (up->offset + nr_args > ctx->nr_user_bufs)
10617 return -EINVAL;
10618
10619 for (done = 0; done < nr_args; done++) {
0b8c0e7c
PB
10620 struct io_mapped_ubuf *imu;
10621 int offset = up->offset + done;
634d00df
PB
10622 u64 tag = 0;
10623
10624 err = io_copy_iov(ctx, &iov, iovs, done);
10625 if (err)
10626 break;
10627 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
10628 err = -EFAULT;
10629 break;
10630 }
0b8c0e7c
PB
10631 err = io_buffer_validate(&iov);
10632 if (err)
10633 break;
cf3770e7
CIK
10634 if (!iov.iov_base && tag) {
10635 err = -EINVAL;
10636 break;
10637 }
0b8c0e7c
PB
10638 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
10639 if (err)
10640 break;
634d00df 10641
0b8c0e7c 10642 i = array_index_nospec(offset, ctx->nr_user_bufs);
6224843d 10643 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
4cdd158b 10644 err = io_queue_rsrc_removal(ctx->buf_data, i,
0b8c0e7c
PB
10645 ctx->rsrc_node, ctx->user_bufs[i]);
10646 if (unlikely(err)) {
10647 io_buffer_unmap(ctx, &imu);
634d00df 10648 break;
0b8c0e7c 10649 }
634d00df
PB
10650 ctx->user_bufs[i] = NULL;
10651 needs_switch = true;
10652 }
10653
0b8c0e7c 10654 ctx->user_bufs[i] = imu;
2d091d62 10655 *io_get_tag_slot(ctx->buf_data, offset) = tag;
634d00df
PB
10656 }
10657
10658 if (needs_switch)
10659 io_rsrc_node_switch(ctx, ctx->buf_data);
10660 return done ? done : err;
10661}
10662
c75312dd
UA
10663static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
10664 unsigned int eventfd_async)
9b402849 10665{
77bc59b4 10666 struct io_ev_fd *ev_fd;
9b402849 10667 __s32 __user *fds = arg;
f0a4e62b 10668 int fd;
9b402849 10669
77bc59b4
UA
10670 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
10671 lockdep_is_held(&ctx->uring_lock));
10672 if (ev_fd)
9b402849
JA
10673 return -EBUSY;
10674
10675 if (copy_from_user(&fd, fds, sizeof(*fds)))
10676 return -EFAULT;
10677
77bc59b4
UA
10678 ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
10679 if (!ev_fd)
10680 return -ENOMEM;
fe7e3257 10681
77bc59b4
UA
10682 ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
10683 if (IS_ERR(ev_fd->cq_ev_fd)) {
f0a4e62b 10684 int ret = PTR_ERR(ev_fd->cq_ev_fd);
77bc59b4 10685 kfree(ev_fd);
9b402849
JA
10686 return ret;
10687 }
c75312dd 10688 ev_fd->eventfd_async = eventfd_async;
9aa8dfde 10689 ctx->has_evfd = true;
77bc59b4 10690 rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
f0a4e62b 10691 return 0;
77bc59b4
UA
10692}
10693
10694static void io_eventfd_put(struct rcu_head *rcu)
10695{
10696 struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
10697
10698 eventfd_ctx_put(ev_fd->cq_ev_fd);
10699 kfree(ev_fd);
9b402849
JA
10700}
10701
10702static int io_eventfd_unregister(struct io_ring_ctx *ctx)
10703{
77bc59b4
UA
10704 struct io_ev_fd *ev_fd;
10705
10706 ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
10707 lockdep_is_held(&ctx->uring_lock));
10708 if (ev_fd) {
9aa8dfde 10709 ctx->has_evfd = false;
77bc59b4
UA
10710 rcu_assign_pointer(ctx->io_ev_fd, NULL);
10711 call_rcu(&ev_fd->rcu, io_eventfd_put);
9b402849
JA
10712 return 0;
10713 }
10714
10715 return -ENXIO;
10716}
10717
5a2e745d
JA
10718static void io_destroy_buffers(struct io_ring_ctx *ctx)
10719{
9cfc7e94
JA
10720 struct io_buffer_list *bl;
10721 unsigned long index;
dbc7d452
JA
10722 int i;
10723
9cfc7e94
JA
10724 for (i = 0; i < BGID_ARRAY; i++) {
10725 if (!ctx->io_bl)
10726 break;
10727 __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
10728 }
dbc7d452 10729
9cfc7e94
JA
10730 xa_for_each(&ctx->io_bl_xa, index, bl) {
10731 xa_erase(&ctx->io_bl_xa, bl->bgid);
10732 __io_remove_buffers(ctx, bl, -1U);
dbc7d452 10733 }
cc3cec83
JA
10734
10735 while (!list_empty(&ctx->io_buffers_pages)) {
10736 struct page *page;
10737
10738 page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
10739 list_del_init(&page->lru);
10740 __free_page(page);
10741 }
5a2e745d
JA
10742}
10743
4010fec4 10744static void io_req_caches_free(struct io_ring_ctx *ctx)
2b188cc1 10745{
cd0ca2e0 10746 struct io_submit_state *state = &ctx->submit_state;
37f0e767 10747 int nr = 0;
bf019da7 10748
9a4fdbd8 10749 mutex_lock(&ctx->uring_lock);
cd0ca2e0 10750 io_flush_cached_locked_reqs(ctx, state);
9a4fdbd8 10751
88ab95be 10752 while (!io_req_cache_empty(ctx)) {
c2b6c6bc
PB
10753 struct io_wq_work_node *node;
10754 struct io_kiocb *req;
9a4fdbd8 10755
c2b6c6bc
PB
10756 node = wq_stack_extract(&state->free_list);
10757 req = container_of(node, struct io_kiocb, comp_list);
10758 kmem_cache_free(req_cachep, req);
37f0e767 10759 nr++;
c2b6c6bc 10760 }
37f0e767
PB
10761 if (nr)
10762 percpu_ref_put_many(&ctx->refs, nr);
9a4fdbd8
JA
10763 mutex_unlock(&ctx->uring_lock);
10764}
10765
43597aac 10766static void io_wait_rsrc_data(struct io_rsrc_data *data)
2b188cc1 10767{
43597aac 10768 if (data && !atomic_dec_and_test(&data->refs))
bd54b6fe 10769 wait_for_completion(&data->done);
bd54b6fe 10770}
04fc6c80 10771
4d9237e3
JA
10772static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
10773{
10774 struct async_poll *apoll;
10775
10776 while (!list_empty(&ctx->apoll_cache)) {
10777 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
10778 poll.wait.entry);
10779 list_del(&apoll->poll.wait.entry);
10780 kfree(apoll);
10781 }
10782}
10783
c072481d 10784static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2b188cc1 10785{
37d1e2e3 10786 io_sq_thread_finish(ctx);
2aede0e4 10787
37d1e2e3 10788 if (ctx->mm_account) {
2aede0e4
JA
10789 mmdrop(ctx->mm_account);
10790 ctx->mm_account = NULL;
30975825 10791 }
def596e9 10792
ab409402 10793 io_rsrc_refs_drop(ctx);
43597aac
PB
10794 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
10795 io_wait_rsrc_data(ctx->buf_data);
10796 io_wait_rsrc_data(ctx->file_data);
10797
8bad28d8 10798 mutex_lock(&ctx->uring_lock);
43597aac 10799 if (ctx->buf_data)
bd54b6fe 10800 __io_sqe_buffers_unregister(ctx);
43597aac 10801 if (ctx->file_data)
08480400 10802 __io_sqe_files_unregister(ctx);
c4ea060e
PB
10803 if (ctx->rings)
10804 __io_cqring_overflow_flush(ctx, true);
9b402849 10805 io_eventfd_unregister(ctx);
4d9237e3 10806 io_flush_apoll_cache(ctx);
77bc59b4 10807 mutex_unlock(&ctx->uring_lock);
5a2e745d 10808 io_destroy_buffers(ctx);
07db298a
PB
10809 if (ctx->sq_creds)
10810 put_cred(ctx->sq_creds);
def596e9 10811
a7f0ed5a
PB
10812 /* there are no registered resources left, nobody uses it */
10813 if (ctx->rsrc_node)
10814 io_rsrc_node_destroy(ctx->rsrc_node);
8dd03afe 10815 if (ctx->rsrc_backup_node)
b895c9a6 10816 io_rsrc_node_destroy(ctx->rsrc_backup_node);
a7f0ed5a 10817 flush_delayed_work(&ctx->rsrc_put_work);
756ab7c0 10818 flush_delayed_work(&ctx->fallback_work);
a7f0ed5a
PB
10819
10820 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
10821 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
def596e9 10822
2b188cc1 10823#if defined(CONFIG_UNIX)
355e8d26
EB
10824 if (ctx->ring_sock) {
10825 ctx->ring_sock->file = NULL; /* so that iput() is called */
2b188cc1 10826 sock_release(ctx->ring_sock);
355e8d26 10827 }
2b188cc1 10828#endif
ef9dd637 10829 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2b188cc1 10830
75b28aff 10831 io_mem_free(ctx->rings);
2b188cc1 10832 io_mem_free(ctx->sq_sqes);
2b188cc1
JA
10833
10834 percpu_ref_exit(&ctx->refs);
2b188cc1 10835 free_uid(ctx->user);
4010fec4 10836 io_req_caches_free(ctx);
e941894e
JA
10837 if (ctx->hash_map)
10838 io_wq_put_hash(ctx->hash_map);
78076bb6 10839 kfree(ctx->cancel_hash);
6224843d 10840 kfree(ctx->dummy_ubuf);
9cfc7e94
JA
10841 kfree(ctx->io_bl);
10842 xa_destroy(&ctx->io_bl_xa);
2b188cc1
JA
10843 kfree(ctx);
10844}
10845
10846static __poll_t io_uring_poll(struct file *file, poll_table *wait)
10847{
10848 struct io_ring_ctx *ctx = file->private_data;
10849 __poll_t mask = 0;
10850
d60aa65b 10851 poll_wait(file, &ctx->cq_wait, wait);
4f7067c3
SB
10852 /*
10853 * synchronizes with barrier from wq_has_sleeper call in
10854 * io_commit_cqring
10855 */
2b188cc1 10856 smp_rmb();
90554200 10857 if (!io_sqring_full(ctx))
2b188cc1 10858 mask |= EPOLLOUT | EPOLLWRNORM;
ed670c3f
HX
10859
10860 /*
10861 * Don't flush cqring overflow list here, just do a simple check.
10862 * Otherwise there could possible be ABBA deadlock:
10863 * CPU0 CPU1
10864 * ---- ----
10865 * lock(&ctx->uring_lock);
10866 * lock(&ep->mtx);
10867 * lock(&ctx->uring_lock);
10868 * lock(&ep->mtx);
10869 *
10870 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
10871 * pushs them to do the flush.
10872 */
10988a0a
DY
10873 if (io_cqring_events(ctx) ||
10874 test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
2b188cc1
JA
10875 mask |= EPOLLIN | EPOLLRDNORM;
10876
10877 return mask;
10878}
10879
0bead8cd 10880static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
071698e1 10881{
4379bf8b 10882 const struct cred *creds;
071698e1 10883
61cf9370 10884 creds = xa_erase(&ctx->personalities, id);
4379bf8b
JA
10885 if (creds) {
10886 put_cred(creds);
0bead8cd 10887 return 0;
1e6fa521 10888 }
0bead8cd
YD
10889
10890 return -EINVAL;
10891}
10892
d56d938b
PB
10893struct io_tctx_exit {
10894 struct callback_head task_work;
10895 struct completion completion;
baf186c4 10896 struct io_ring_ctx *ctx;
d56d938b
PB
10897};
10898
c072481d 10899static __cold void io_tctx_exit_cb(struct callback_head *cb)
d56d938b
PB
10900{
10901 struct io_uring_task *tctx = current->io_uring;
10902 struct io_tctx_exit *work;
10903
10904 work = container_of(cb, struct io_tctx_exit, task_work);
10905 /*
10906 * When @in_idle, we're in cancellation and it's racy to remove the
10907 * node. It'll be removed by the end of cancellation, just ignore it.
10908 */
10909 if (!atomic_read(&tctx->in_idle))
eef51daa 10910 io_uring_del_tctx_node((unsigned long)work->ctx);
d56d938b
PB
10911 complete(&work->completion);
10912}
10913
c072481d 10914static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
28090c13
PB
10915{
10916 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
10917
10918 return req->ctx == data;
10919}
10920
c072481d 10921static __cold void io_ring_exit_work(struct work_struct *work)
85faa7b8 10922{
d56d938b 10923 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
b5bb3a24 10924 unsigned long timeout = jiffies + HZ * 60 * 5;
58d3be2c 10925 unsigned long interval = HZ / 20;
d56d938b
PB
10926 struct io_tctx_exit exit;
10927 struct io_tctx_node *node;
10928 int ret;
85faa7b8 10929
56952e91
JA
10930 /*
10931 * If we're doing polled IO and end up having requests being
10932 * submitted async (out-of-line), then completions can come in while
10933 * we're waiting for refs to drop. We need to reap these manually,
10934 * as nobody else will be looking for them.
10935 */
b2edc0a7 10936 do {
3dd0c97a 10937 io_uring_try_cancel_requests(ctx, NULL, true);
28090c13
PB
10938 if (ctx->sq_data) {
10939 struct io_sq_data *sqd = ctx->sq_data;
10940 struct task_struct *tsk;
10941
10942 io_sq_thread_park(sqd);
10943 tsk = sqd->thread;
10944 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
10945 io_wq_cancel_cb(tsk->io_uring->io_wq,
10946 io_cancel_ctx_cb, ctx, true);
10947 io_sq_thread_unpark(sqd);
10948 }
b5bb3a24 10949
37f0e767
PB
10950 io_req_caches_free(ctx);
10951
58d3be2c
PB
10952 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
10953 /* there is little hope left, don't run it too often */
10954 interval = HZ * 60;
10955 }
10956 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
d56d938b 10957
7f00651a
PB
10958 init_completion(&exit.completion);
10959 init_task_work(&exit.task_work, io_tctx_exit_cb);
10960 exit.ctx = ctx;
89b5066e
PB
10961 /*
10962 * Some may use context even when all refs and requests have been put,
10963 * and they are free to do so while still holding uring_lock or
5b0a6acc 10964 * completion_lock, see io_req_task_submit(). Apart from other work,
89b5066e
PB
10965 * this lock/unlock section also waits them to finish.
10966 */
d56d938b
PB
10967 mutex_lock(&ctx->uring_lock);
10968 while (!list_empty(&ctx->tctx_list)) {
b5bb3a24
PB
10969 WARN_ON_ONCE(time_after(jiffies, timeout));
10970
d56d938b
PB
10971 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
10972 ctx_node);
7f00651a
PB
10973 /* don't spin on a single task if cancellation failed */
10974 list_rotate_left(&ctx->tctx_list);
d56d938b
PB
10975 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
10976 if (WARN_ON_ONCE(ret))
10977 continue;
d56d938b
PB
10978
10979 mutex_unlock(&ctx->uring_lock);
10980 wait_for_completion(&exit.completion);
d56d938b
PB
10981 mutex_lock(&ctx->uring_lock);
10982 }
10983 mutex_unlock(&ctx->uring_lock);
79ebeaee
JA
10984 spin_lock(&ctx->completion_lock);
10985 spin_unlock(&ctx->completion_lock);
d56d938b 10986
85faa7b8
JA
10987 io_ring_ctx_free(ctx);
10988}
10989
80c4cbdb 10990/* Returns true if we found and killed one or more timeouts */
c072481d
PB
10991static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
10992 struct task_struct *tsk, bool cancel_all)
80c4cbdb
PB
10993{
10994 struct io_kiocb *req, *tmp;
10995 int canceled = 0;
10996
79ebeaee
JA
10997 spin_lock(&ctx->completion_lock);
10998 spin_lock_irq(&ctx->timeout_lock);
80c4cbdb 10999 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
3dd0c97a 11000 if (io_match_task(req, tsk, cancel_all)) {
80c4cbdb
PB
11001 io_kill_timeout(req, -ECANCELED);
11002 canceled++;
11003 }
11004 }
79ebeaee 11005 spin_unlock_irq(&ctx->timeout_lock);
60053be8 11006 io_commit_cqring(ctx);
79ebeaee 11007 spin_unlock(&ctx->completion_lock);
80c4cbdb
PB
11008 if (canceled != 0)
11009 io_cqring_ev_posted(ctx);
11010 return canceled != 0;
11011}
11012
c072481d 11013static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2b188cc1 11014{
61cf9370
MWO
11015 unsigned long index;
11016 struct creds *creds;
11017
2b188cc1
JA
11018 mutex_lock(&ctx->uring_lock);
11019 percpu_ref_kill(&ctx->refs);
634578f8 11020 if (ctx->rings)
6c2450ae 11021 __io_cqring_overflow_flush(ctx, true);
61cf9370
MWO
11022 xa_for_each(&ctx->personalities, index, creds)
11023 io_unregister_personality(ctx, index);
2b188cc1
JA
11024 mutex_unlock(&ctx->uring_lock);
11025
60053be8
PB
11026 /* failed during ring init, it couldn't have issued any requests */
11027 if (ctx->rings) {
11028 io_kill_timeouts(ctx, NULL, true);
11029 io_poll_remove_all(ctx, NULL, true);
11030 /* if we failed setting up the ctx, we might not have any rings */
11031 io_iopoll_try_reap_events(ctx);
11032 }
309fc03a 11033
85faa7b8 11034 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
fc666777
JA
11035 /*
11036 * Use system_unbound_wq to avoid spawning tons of event kworkers
11037 * if we're exiting a ton of rings at the same time. It just adds
11038 * noise and overhead, there's no discernable change in runtime
11039 * over using system_wq.
11040 */
11041 queue_work(system_unbound_wq, &ctx->exit_work);
2b188cc1
JA
11042}
11043
11044static int io_uring_release(struct inode *inode, struct file *file)
11045{
11046 struct io_ring_ctx *ctx = file->private_data;
11047
11048 file->private_data = NULL;
11049 io_ring_ctx_wait_and_kill(ctx);
11050 return 0;
11051}
11052
f6edbabb
PB
11053struct io_task_cancel {
11054 struct task_struct *task;
3dd0c97a 11055 bool all;
f6edbabb 11056};
f254ac04 11057
f6edbabb 11058static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
b711d4ea 11059{
9a472ef7 11060 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
f6edbabb 11061 struct io_task_cancel *cancel = data;
9a472ef7 11062
6af3f48b 11063 return io_match_task_safe(req, cancel->task, cancel->all);
b711d4ea
JA
11064}
11065
c072481d
PB
11066static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
11067 struct task_struct *task,
11068 bool cancel_all)
b7ddce3c 11069{
e1915f76 11070 struct io_defer_entry *de;
b7ddce3c
PB
11071 LIST_HEAD(list);
11072
79ebeaee 11073 spin_lock(&ctx->completion_lock);
b7ddce3c 11074 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
6af3f48b 11075 if (io_match_task_safe(de->req, task, cancel_all)) {
b7ddce3c
PB
11076 list_cut_position(&list, &ctx->defer_list, &de->list);
11077 break;
11078 }
11079 }
79ebeaee 11080 spin_unlock(&ctx->completion_lock);
e1915f76
PB
11081 if (list_empty(&list))
11082 return false;
b7ddce3c
PB
11083
11084 while (!list_empty(&list)) {
11085 de = list_first_entry(&list, struct io_defer_entry, list);
11086 list_del_init(&de->list);
f41db273 11087 io_req_complete_failed(de->req, -ECANCELED);
b7ddce3c
PB
11088 kfree(de);
11089 }
e1915f76 11090 return true;
b7ddce3c
PB
11091}
11092
c072481d 11093static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
1b00764f
PB
11094{
11095 struct io_tctx_node *node;
11096 enum io_wq_cancel cret;
11097 bool ret = false;
11098
11099 mutex_lock(&ctx->uring_lock);
11100 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
11101 struct io_uring_task *tctx = node->task->io_uring;
11102
11103 /*
11104 * io_wq will stay alive while we hold uring_lock, because it's
11105 * killed after ctx nodes, which requires to take the lock.
11106 */
11107 if (!tctx || !tctx->io_wq)
11108 continue;
11109 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
11110 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
11111 }
11112 mutex_unlock(&ctx->uring_lock);
11113
11114 return ret;
11115}
11116
c072481d
PB
11117static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
11118 struct task_struct *task,
11119 bool cancel_all)
9936c7c2 11120{
3dd0c97a 11121 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
1b00764f 11122 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9936c7c2 11123
60053be8
PB
11124 /* failed during ring init, it couldn't have issued any requests */
11125 if (!ctx->rings)
11126 return;
11127
9936c7c2
PB
11128 while (1) {
11129 enum io_wq_cancel cret;
11130 bool ret = false;
11131
1b00764f
PB
11132 if (!task) {
11133 ret |= io_uring_try_cancel_iowq(ctx);
11134 } else if (tctx && tctx->io_wq) {
11135 /*
11136 * Cancels requests of all rings, not only @ctx, but
11137 * it's fine as the task is in exit/exec.
11138 */
5aa75ed5 11139 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9936c7c2
PB
11140 &cancel, true);
11141 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
11142 }
11143
11144 /* SQPOLL thread does its own polling */
3dd0c97a 11145 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
d052d1d6 11146 (ctx->sq_data && ctx->sq_data->thread == current)) {
5eef4e87 11147 while (!wq_list_empty(&ctx->iopoll_list)) {
9936c7c2
PB
11148 io_iopoll_try_reap_events(ctx);
11149 ret = true;
11150 }
11151 }
11152
3dd0c97a
PB
11153 ret |= io_cancel_defer_files(ctx, task, cancel_all);
11154 ret |= io_poll_remove_all(ctx, task, cancel_all);
11155 ret |= io_kill_timeouts(ctx, task, cancel_all);
e5dc480d
PB
11156 if (task)
11157 ret |= io_run_task_work();
9936c7c2
PB
11158 if (!ret)
11159 break;
11160 cond_resched();
11161 }
11162}
11163
eef51daa 11164static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
0f212204 11165{
236434c3 11166 struct io_uring_task *tctx = current->io_uring;
13bf43f5 11167 struct io_tctx_node *node;
a528b04e 11168 int ret;
236434c3
MWO
11169
11170 if (unlikely(!tctx)) {
5aa75ed5 11171 ret = io_uring_alloc_task_context(current, ctx);
0f212204
JA
11172 if (unlikely(ret))
11173 return ret;
e139a1ec 11174
236434c3 11175 tctx = current->io_uring;
e139a1ec
PB
11176 if (ctx->iowq_limits_set) {
11177 unsigned int limits[2] = { ctx->iowq_limits[0],
11178 ctx->iowq_limits[1], };
11179
11180 ret = io_wq_max_workers(tctx->io_wq, limits);
11181 if (ret)
11182 return ret;
11183 }
0f212204 11184 }
cf27f3b1
PB
11185 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
11186 node = kmalloc(sizeof(*node), GFP_KERNEL);
11187 if (!node)
11188 return -ENOMEM;
11189 node->ctx = ctx;
11190 node->task = current;
13bf43f5 11191
cf27f3b1
PB
11192 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
11193 node, GFP_KERNEL));
11194 if (ret) {
11195 kfree(node);
11196 return ret;
0f212204 11197 }
cf27f3b1
PB
11198
11199 mutex_lock(&ctx->uring_lock);
11200 list_add(&node->ctx_node, &ctx->tctx_list);
11201 mutex_unlock(&ctx->uring_lock);
0f212204 11202 }
cf27f3b1 11203 tctx->last = ctx;
0f212204
JA
11204 return 0;
11205}
11206
cf27f3b1
PB
11207/*
11208 * Note that this task has used io_uring. We use it for cancelation purposes.
11209 */
eef51daa 11210static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
cf27f3b1
PB
11211{
11212 struct io_uring_task *tctx = current->io_uring;
11213
11214 if (likely(tctx && tctx->last == ctx))
11215 return 0;
eef51daa 11216 return __io_uring_add_tctx_node(ctx);
cf27f3b1
PB
11217}
11218
0f212204
JA
11219/*
11220 * Remove this io_uring_file -> task mapping.
11221 */
c072481d 11222static __cold void io_uring_del_tctx_node(unsigned long index)
0f212204
JA
11223{
11224 struct io_uring_task *tctx = current->io_uring;
13bf43f5 11225 struct io_tctx_node *node;
2941267b 11226
eebd2e37
PB
11227 if (!tctx)
11228 return;
13bf43f5
PB
11229 node = xa_erase(&tctx->xa, index);
11230 if (!node)
2941267b 11231 return;
0f212204 11232
13bf43f5
PB
11233 WARN_ON_ONCE(current != node->task);
11234 WARN_ON_ONCE(list_empty(&node->ctx_node));
11235
11236 mutex_lock(&node->ctx->uring_lock);
11237 list_del(&node->ctx_node);
11238 mutex_unlock(&node->ctx->uring_lock);
11239
baf186c4 11240 if (tctx->last == node->ctx)
0f212204 11241 tctx->last = NULL;
13bf43f5 11242 kfree(node);
0f212204
JA
11243}
11244
c072481d 11245static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
de7f1d9e 11246{
ba5ef6dc 11247 struct io_wq *wq = tctx->io_wq;
13bf43f5 11248 struct io_tctx_node *node;
de7f1d9e
PB
11249 unsigned long index;
11250
8bab4c09 11251 xa_for_each(&tctx->xa, index, node) {
eef51daa 11252 io_uring_del_tctx_node(index);
8bab4c09
JA
11253 cond_resched();
11254 }
b16ef427
ME
11255 if (wq) {
11256 /*
f6f9b278 11257 * Must be after io_uring_del_tctx_node() (removes nodes under
b16ef427
ME
11258 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
11259 */
ba5ef6dc 11260 io_wq_put_and_exit(wq);
dadebc35 11261 tctx->io_wq = NULL;
b16ef427 11262 }
de7f1d9e
PB
11263}
11264
3f48cf18 11265static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
521d6a73 11266{
3f48cf18 11267 if (tracked)
d5361233 11268 return 0;
521d6a73
PB
11269 return percpu_counter_sum(&tctx->inflight);
11270}
11271
78cc687b
PB
11272/*
11273 * Find any io_uring ctx that this task has registered or done IO on, and cancel
78a78060 11274 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
78cc687b 11275 */
c072481d
PB
11276static __cold void io_uring_cancel_generic(bool cancel_all,
11277 struct io_sq_data *sqd)
0e9ddb39 11278{
521d6a73 11279 struct io_uring_task *tctx = current->io_uring;
734551df 11280 struct io_ring_ctx *ctx;
0e9ddb39
PB
11281 s64 inflight;
11282 DEFINE_WAIT(wait);
fdaf083c 11283
78cc687b
PB
11284 WARN_ON_ONCE(sqd && sqd->thread != current);
11285
6d042ffb
PO
11286 if (!current->io_uring)
11287 return;
17a91051
PB
11288 if (tctx->io_wq)
11289 io_wq_exit_start(tctx->io_wq);
11290
0e9ddb39
PB
11291 atomic_inc(&tctx->in_idle);
11292 do {
e9dbe221 11293 io_uring_drop_tctx_refs(current);
0e9ddb39 11294 /* read completions before cancelations */
78cc687b 11295 inflight = tctx_inflight(tctx, !cancel_all);
0e9ddb39
PB
11296 if (!inflight)
11297 break;
fdaf083c 11298
78cc687b
PB
11299 if (!sqd) {
11300 struct io_tctx_node *node;
11301 unsigned long index;
0f212204 11302
78cc687b
PB
11303 xa_for_each(&tctx->xa, index, node) {
11304 /* sqpoll task will cancel all its requests */
11305 if (node->ctx->sq_data)
11306 continue;
11307 io_uring_try_cancel_requests(node->ctx, current,
11308 cancel_all);
11309 }
11310 } else {
11311 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
11312 io_uring_try_cancel_requests(ctx, current,
11313 cancel_all);
11314 }
17a91051 11315
78a78060
JA
11316 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
11317 io_run_task_work();
e9dbe221 11318 io_uring_drop_tctx_refs(current);
78a78060 11319
0f212204 11320 /*
a1bb3cd5
PB
11321 * If we've seen completions, retry without waiting. This
11322 * avoids a race where a completion comes in before we did
11323 * prepare_to_wait().
0f212204 11324 */
3dd0c97a 11325 if (inflight == tctx_inflight(tctx, !cancel_all))
a1bb3cd5 11326 schedule();
f57555ed 11327 finish_wait(&tctx->wait, &wait);
d8a6df10 11328 } while (1);
de7f1d9e 11329
8452d4a6 11330 io_uring_clean_tctx(tctx);
3dd0c97a 11331 if (cancel_all) {
3cc7fdb9
PB
11332 /*
11333 * We shouldn't run task_works after cancel, so just leave
11334 * ->in_idle set for normal exit.
11335 */
11336 atomic_dec(&tctx->in_idle);
3f48cf18
PB
11337 /* for exec all current's requests should be gone, kill tctx */
11338 __io_uring_free(current);
11339 }
44e728b8
PB
11340}
11341
f552a27a 11342void __io_uring_cancel(bool cancel_all)
78cc687b 11343{
f552a27a 11344 io_uring_cancel_generic(cancel_all, NULL);
78cc687b
PB
11345}
11346
e7a6c00d
JA
11347void io_uring_unreg_ringfd(void)
11348{
11349 struct io_uring_task *tctx = current->io_uring;
11350 int i;
11351
11352 for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
11353 if (tctx->registered_rings[i]) {
11354 fput(tctx->registered_rings[i]);
11355 tctx->registered_rings[i] = NULL;
11356 }
11357 }
11358}
11359
11360static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
11361 int start, int end)
11362{
11363 struct file *file;
11364 int offset;
11365
11366 for (offset = start; offset < end; offset++) {
11367 offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
11368 if (tctx->registered_rings[offset])
11369 continue;
11370
11371 file = fget(fd);
11372 if (!file) {
11373 return -EBADF;
11374 } else if (file->f_op != &io_uring_fops) {
11375 fput(file);
11376 return -EOPNOTSUPP;
11377 }
11378 tctx->registered_rings[offset] = file;
11379 return offset;
11380 }
11381
11382 return -EBUSY;
11383}
11384
11385/*
11386 * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
11387 * invocation. User passes in an array of struct io_uring_rsrc_update
11388 * with ->data set to the ring_fd, and ->offset given for the desired
11389 * index. If no index is desired, application may set ->offset == -1U
11390 * and we'll find an available index. Returns number of entries
11391 * successfully processed, or < 0 on error if none were processed.
11392 */
11393static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
11394 unsigned nr_args)
11395{
11396 struct io_uring_rsrc_update __user *arg = __arg;
11397 struct io_uring_rsrc_update reg;
11398 struct io_uring_task *tctx;
11399 int ret, i;
11400
11401 if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
11402 return -EINVAL;
11403
11404 mutex_unlock(&ctx->uring_lock);
11405 ret = io_uring_add_tctx_node(ctx);
11406 mutex_lock(&ctx->uring_lock);
11407 if (ret)
11408 return ret;
11409
11410 tctx = current->io_uring;
11411 for (i = 0; i < nr_args; i++) {
11412 int start, end;
11413
11414 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
11415 ret = -EFAULT;
11416 break;
11417 }
11418
6fb53cf8
DY
11419 if (reg.resv) {
11420 ret = -EINVAL;
11421 break;
11422 }
11423
e7a6c00d
JA
11424 if (reg.offset == -1U) {
11425 start = 0;
11426 end = IO_RINGFD_REG_MAX;
11427 } else {
11428 if (reg.offset >= IO_RINGFD_REG_MAX) {
11429 ret = -EINVAL;
11430 break;
11431 }
11432 start = reg.offset;
11433 end = start + 1;
11434 }
11435
11436 ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
11437 if (ret < 0)
11438 break;
11439
11440 reg.offset = ret;
11441 if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
11442 fput(tctx->registered_rings[reg.offset]);
11443 tctx->registered_rings[reg.offset] = NULL;
11444 ret = -EFAULT;
11445 break;
11446 }
11447 }
11448
11449 return i ? i : ret;
11450}
11451
11452static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
11453 unsigned nr_args)
11454{
11455 struct io_uring_rsrc_update __user *arg = __arg;
11456 struct io_uring_task *tctx = current->io_uring;
11457 struct io_uring_rsrc_update reg;
11458 int ret = 0, i;
11459
11460 if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
11461 return -EINVAL;
11462 if (!tctx)
11463 return 0;
11464
11465 for (i = 0; i < nr_args; i++) {
11466 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
11467 ret = -EFAULT;
11468 break;
11469 }
303cc749 11470 if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
e7a6c00d
JA
11471 ret = -EINVAL;
11472 break;
11473 }
11474
11475 reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
11476 if (tctx->registered_rings[reg.offset]) {
11477 fput(tctx->registered_rings[reg.offset]);
11478 tctx->registered_rings[reg.offset] = NULL;
11479 }
11480 }
11481
11482 return i ? i : ret;
11483}
11484
6c5c240e
RP
11485static void *io_uring_validate_mmap_request(struct file *file,
11486 loff_t pgoff, size_t sz)
2b188cc1 11487{
2b188cc1 11488 struct io_ring_ctx *ctx = file->private_data;
6c5c240e 11489 loff_t offset = pgoff << PAGE_SHIFT;
2b188cc1
JA
11490 struct page *page;
11491 void *ptr;
11492
11493 switch (offset) {
11494 case IORING_OFF_SQ_RING:
75b28aff
HV
11495 case IORING_OFF_CQ_RING:
11496 ptr = ctx->rings;
2b188cc1
JA
11497 break;
11498 case IORING_OFF_SQES:
11499 ptr = ctx->sq_sqes;
11500 break;
2b188cc1 11501 default:
6c5c240e 11502 return ERR_PTR(-EINVAL);
2b188cc1
JA
11503 }
11504
11505 page = virt_to_head_page(ptr);
a50b854e 11506 if (sz > page_size(page))
6c5c240e
RP
11507 return ERR_PTR(-EINVAL);
11508
11509 return ptr;
11510}
11511
11512#ifdef CONFIG_MMU
11513
c072481d 11514static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
6c5c240e
RP
11515{
11516 size_t sz = vma->vm_end - vma->vm_start;
11517 unsigned long pfn;
11518 void *ptr;
11519
11520 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
11521 if (IS_ERR(ptr))
11522 return PTR_ERR(ptr);
2b188cc1
JA
11523
11524 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
11525 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
11526}
11527
6c5c240e
RP
11528#else /* !CONFIG_MMU */
11529
11530static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
11531{
11532 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
11533}
11534
11535static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
11536{
11537 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
11538}
11539
11540static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
11541 unsigned long addr, unsigned long len,
11542 unsigned long pgoff, unsigned long flags)
11543{
11544 void *ptr;
11545
11546 ptr = io_uring_validate_mmap_request(file, pgoff, len);
11547 if (IS_ERR(ptr))
11548 return PTR_ERR(ptr);
11549
11550 return (unsigned long) ptr;
11551}
11552
11553#endif /* !CONFIG_MMU */
11554
d9d05217 11555static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
90554200
JA
11556{
11557 DEFINE_WAIT(wait);
11558
11559 do {
11560 if (!io_sqring_full(ctx))
11561 break;
90554200
JA
11562 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
11563
11564 if (!io_sqring_full(ctx))
11565 break;
90554200
JA
11566 schedule();
11567 } while (!signal_pending(current));
11568
11569 finish_wait(&ctx->sqo_sq_wait, &wait);
5199328a 11570 return 0;
90554200
JA
11571}
11572
f81440d3
PB
11573static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
11574{
11575 if (flags & IORING_ENTER_EXT_ARG) {
11576 struct io_uring_getevents_arg arg;
11577
11578 if (argsz != sizeof(arg))
11579 return -EINVAL;
11580 if (copy_from_user(&arg, argp, sizeof(arg)))
11581 return -EFAULT;
11582 }
11583 return 0;
11584}
11585
c73ebb68
HX
11586static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
11587 struct __kernel_timespec __user **ts,
11588 const sigset_t __user **sig)
11589{
11590 struct io_uring_getevents_arg arg;
11591
11592 /*
11593 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
11594 * is just a pointer to the sigset_t.
11595 */
11596 if (!(flags & IORING_ENTER_EXT_ARG)) {
11597 *sig = (const sigset_t __user *) argp;
11598 *ts = NULL;
11599 return 0;
11600 }
11601
11602 /*
11603 * EXT_ARG is set - ensure we agree on the size of it and copy in our
11604 * timespec and sigset_t pointers if good.
11605 */
11606 if (*argsz != sizeof(arg))
11607 return -EINVAL;
11608 if (copy_from_user(&arg, argp, sizeof(arg)))
11609 return -EFAULT;
d2347b96
DY
11610 if (arg.pad)
11611 return -EINVAL;
c73ebb68
HX
11612 *sig = u64_to_user_ptr(arg.sigmask);
11613 *argsz = arg.sigmask_sz;
11614 *ts = u64_to_user_ptr(arg.ts);
11615 return 0;
11616}
11617
2b188cc1 11618SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
c73ebb68
HX
11619 u32, min_complete, u32, flags, const void __user *, argp,
11620 size_t, argsz)
2b188cc1
JA
11621{
11622 struct io_ring_ctx *ctx;
2b188cc1 11623 struct fd f;
33f993da 11624 long ret;
2b188cc1 11625
4c6e277c 11626 io_run_task_work();
b41e9852 11627
33f993da 11628 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
e7a6c00d
JA
11629 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
11630 IORING_ENTER_REGISTERED_RING)))
2b188cc1
JA
11631 return -EINVAL;
11632
e7a6c00d
JA
11633 /*
11634 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
11635 * need only dereference our task private array to find it.
11636 */
11637 if (flags & IORING_ENTER_REGISTERED_RING) {
11638 struct io_uring_task *tctx = current->io_uring;
11639
11640 if (!tctx || fd >= IO_RINGFD_REG_MAX)
11641 return -EINVAL;
11642 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
11643 f.file = tctx->registered_rings[fd];
11644 if (unlikely(!f.file))
11645 return -EBADF;
11646 } else {
11647 f = fdget(fd);
11648 if (unlikely(!f.file))
11649 return -EBADF;
11650 }
2b188cc1
JA
11651
11652 ret = -EOPNOTSUPP;
33f993da 11653 if (unlikely(f.file->f_op != &io_uring_fops))
2b188cc1
JA
11654 goto out_fput;
11655
11656 ret = -ENXIO;
11657 ctx = f.file->private_data;
33f993da 11658 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
2b188cc1
JA
11659 goto out_fput;
11660
7e84e1c7 11661 ret = -EBADFD;
33f993da 11662 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
7e84e1c7
SG
11663 goto out;
11664
6c271ce2
JA
11665 /*
11666 * For SQ polling, the thread will do all submissions and completions.
11667 * Just return the requested submit count, and wake the thread if
11668 * we were asked to.
11669 */
b2a9eada 11670 ret = 0;
6c271ce2 11671 if (ctx->flags & IORING_SETUP_SQPOLL) {
90f67366 11672 io_cqring_overflow_flush(ctx);
89448c47 11673
21f96522
JA
11674 if (unlikely(ctx->sq_data->thread == NULL)) {
11675 ret = -EOWNERDEAD;
04147488 11676 goto out;
21f96522 11677 }
6c271ce2 11678 if (flags & IORING_ENTER_SQ_WAKEUP)
534ca6d6 11679 wake_up(&ctx->sq_data->wait);
d9d05217
PB
11680 if (flags & IORING_ENTER_SQ_WAIT) {
11681 ret = io_sqpoll_wait_sq(ctx);
11682 if (ret)
11683 goto out;
11684 }
3e813c90 11685 ret = to_submit;
b2a9eada 11686 } else if (to_submit) {
eef51daa 11687 ret = io_uring_add_tctx_node(ctx);
0f212204
JA
11688 if (unlikely(ret))
11689 goto out;
7c504e65 11690
2b188cc1 11691 mutex_lock(&ctx->uring_lock);
3e813c90
DY
11692 ret = io_submit_sqes(ctx, to_submit);
11693 if (ret != to_submit) {
d487b43c 11694 mutex_unlock(&ctx->uring_lock);
7c504e65 11695 goto out;
d487b43c
PB
11696 }
11697 if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
11698 goto iopoll_locked;
11699 mutex_unlock(&ctx->uring_lock);
2b188cc1
JA
11700 }
11701 if (flags & IORING_ENTER_GETEVENTS) {
3e813c90 11702 int ret2;
773697b6 11703 if (ctx->syscall_iopoll) {
d487b43c
PB
11704 /*
11705 * We disallow the app entering submit/complete with
11706 * polling, but we still need to lock the ring to
11707 * prevent racing with polled issue that got punted to
11708 * a workqueue.
11709 */
11710 mutex_lock(&ctx->uring_lock);
11711iopoll_locked:
3e813c90
DY
11712 ret2 = io_validate_ext_arg(flags, argp, argsz);
11713 if (likely(!ret2)) {
11714 min_complete = min(min_complete,
11715 ctx->cq_entries);
11716 ret2 = io_iopoll_check(ctx, min_complete);
d487b43c
PB
11717 }
11718 mutex_unlock(&ctx->uring_lock);
def596e9 11719 } else {
f81440d3
PB
11720 const sigset_t __user *sig;
11721 struct __kernel_timespec __user *ts;
11722
3e813c90
DY
11723 ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
11724 if (likely(!ret2)) {
11725 min_complete = min(min_complete,
11726 ctx->cq_entries);
11727 ret2 = io_cqring_wait(ctx, min_complete, sig,
11728 argsz, ts);
11729 }
def596e9 11730 }
c73ebb68 11731
155bc950 11732 if (!ret) {
3e813c90 11733 ret = ret2;
2b188cc1 11734
155bc950
DY
11735 /*
11736 * EBADR indicates that one or more CQE were dropped.
11737 * Once the user has been informed we can clear the bit
11738 * as they are obviously ok with those drops.
11739 */
11740 if (unlikely(ret2 == -EBADR))
11741 clear_bit(IO_CHECK_CQ_DROPPED_BIT,
11742 &ctx->check_cq);
def596e9 11743 }
2b188cc1
JA
11744 }
11745
7c504e65 11746out:
6805b32e 11747 percpu_ref_put(&ctx->refs);
2b188cc1 11748out_fput:
e7a6c00d
JA
11749 if (!(flags & IORING_ENTER_REGISTERED_RING))
11750 fdput(f);
3e813c90 11751 return ret;
2b188cc1
JA
11752}
11753
bebdb65e 11754#ifdef CONFIG_PROC_FS
c072481d 11755static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
61cf9370 11756 const struct cred *cred)
87ce955b 11757{
87ce955b
JA
11758 struct user_namespace *uns = seq_user_ns(m);
11759 struct group_info *gi;
11760 kernel_cap_t cap;
11761 unsigned __capi;
11762 int g;
11763
11764 seq_printf(m, "%5d\n", id);
11765 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
11766 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
11767 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
11768 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
11769 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
11770 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
11771 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
11772 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
11773 seq_puts(m, "\n\tGroups:\t");
11774 gi = cred->group_info;
11775 for (g = 0; g < gi->ngroups; g++) {
11776 seq_put_decimal_ull(m, g ? " " : "",
11777 from_kgid_munged(uns, gi->gid[g]));
11778 }
11779 seq_puts(m, "\n\tCapEff:\t");
11780 cap = cred->cap_effective;
11781 CAP_FOR_EACH_U32(__capi)
11782 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
11783 seq_putc(m, '\n');
11784 return 0;
11785}
11786
c072481d
PB
11787static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
11788 struct seq_file *m)
87ce955b 11789{
dbbe9c64 11790 struct io_sq_data *sq = NULL;
83f84356
HX
11791 struct io_overflow_cqe *ocqe;
11792 struct io_rings *r = ctx->rings;
11793 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
83f84356
HX
11794 unsigned int sq_head = READ_ONCE(r->sq.head);
11795 unsigned int sq_tail = READ_ONCE(r->sq.tail);
11796 unsigned int cq_head = READ_ONCE(r->cq.head);
11797 unsigned int cq_tail = READ_ONCE(r->cq.tail);
f9b3dfcc 11798 unsigned int cq_shift = 0;
f75d1183 11799 unsigned int sq_entries, cq_entries;
fad8e0de 11800 bool has_lock;
f9b3dfcc 11801 bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
83f84356
HX
11802 unsigned int i;
11803
f9b3dfcc
SR
11804 if (is_cqe32)
11805 cq_shift = 1;
11806
83f84356
HX
11807 /*
11808 * we may get imprecise sqe and cqe info if uring is actively running
11809 * since we get cached_sq_head and cached_cq_tail without uring_lock
11810 * and sq_tail and cq_head are changed by userspace. But it's ok since
11811 * we usually use these info when it is stuck.
11812 */
c0235652 11813 seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
f75d1183
JA
11814 seq_printf(m, "SqHead:\t%u\n", sq_head);
11815 seq_printf(m, "SqTail:\t%u\n", sq_tail);
11816 seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
11817 seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
11818 seq_printf(m, "CqHead:\t%u\n", cq_head);
11819 seq_printf(m, "CqTail:\t%u\n", cq_tail);
11820 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
11821 seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
11822 sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
11823 for (i = 0; i < sq_entries; i++) {
11824 unsigned int entry = i + sq_head;
11825 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
a1957780 11826 struct io_uring_sqe *sqe;
f75d1183
JA
11827
11828 if (sq_idx > sq_mask)
11829 continue;
11830 sqe = &ctx->sq_sqes[sq_idx];
11831 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
11832 sq_idx, sqe->opcode, sqe->fd, sqe->flags,
11833 sqe->user_data);
83f84356 11834 }
f75d1183
JA
11835 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
11836 cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
11837 for (i = 0; i < cq_entries; i++) {
11838 unsigned int entry = i + cq_head;
f9b3dfcc 11839 struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
83f84356 11840
f9b3dfcc
SR
11841 if (!is_cqe32) {
11842 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
f75d1183
JA
11843 entry & cq_mask, cqe->user_data, cqe->res,
11844 cqe->flags);
f9b3dfcc
SR
11845 } else {
11846 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
11847 "extra1:%llu, extra2:%llu\n",
11848 entry & cq_mask, cqe->user_data, cqe->res,
11849 cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
11850 }
83f84356 11851 }
87ce955b 11852
fad8e0de
JA
11853 /*
11854 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
11855 * since fdinfo case grabs it in the opposite direction of normal use
11856 * cases. If we fail to get the lock, we just don't iterate any
11857 * structures that could be going away outside the io_uring mutex.
11858 */
11859 has_lock = mutex_trylock(&ctx->uring_lock);
11860
5f3f26f9 11861 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
dbbe9c64 11862 sq = ctx->sq_data;
5f3f26f9
JA
11863 if (!sq->thread)
11864 sq = NULL;
11865 }
dbbe9c64
JQ
11866
11867 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
11868 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
87ce955b 11869 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
fad8e0de 11870 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
7b29f92d 11871 struct file *f = io_file_from_index(ctx, i);
87ce955b 11872
87ce955b
JA
11873 if (f)
11874 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
11875 else
11876 seq_printf(m, "%5u: <none>\n", i);
11877 }
11878 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
fad8e0de 11879 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
41edf1a5 11880 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
4751f53d 11881 unsigned int len = buf->ubuf_end - buf->ubuf;
87ce955b 11882
4751f53d 11883 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
87ce955b 11884 }
61cf9370
MWO
11885 if (has_lock && !xa_empty(&ctx->personalities)) {
11886 unsigned long index;
11887 const struct cred *cred;
11888
87ce955b 11889 seq_printf(m, "Personalities:\n");
61cf9370
MWO
11890 xa_for_each(&ctx->personalities, index, cred)
11891 io_uring_show_cred(m, index, cred);
87ce955b 11892 }
83f84356
HX
11893 if (has_lock)
11894 mutex_unlock(&ctx->uring_lock);
11895
11896 seq_puts(m, "PollList:\n");
79ebeaee 11897 spin_lock(&ctx->completion_lock);
d7718a9d
JA
11898 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
11899 struct hlist_head *list = &ctx->cancel_hash[i];
11900 struct io_kiocb *req;
11901
11902 hlist_for_each_entry(req, list, hash_node)
11903 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
7f62d40d 11904 task_work_pending(req->task));
d7718a9d 11905 }
83f84356
HX
11906
11907 seq_puts(m, "CqOverflowList:\n");
11908 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
11909 struct io_uring_cqe *cqe = &ocqe->cqe;
11910
11911 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
11912 cqe->user_data, cqe->res, cqe->flags);
11913
11914 }
11915
79ebeaee 11916 spin_unlock(&ctx->completion_lock);
87ce955b
JA
11917}
11918
c072481d 11919static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
87ce955b
JA
11920{
11921 struct io_ring_ctx *ctx = f->private_data;
11922
11923 if (percpu_ref_tryget(&ctx->refs)) {
11924 __io_uring_show_fdinfo(ctx, m);
11925 percpu_ref_put(&ctx->refs);
11926 }
11927}
bebdb65e 11928#endif
87ce955b 11929
2b188cc1
JA
11930static const struct file_operations io_uring_fops = {
11931 .release = io_uring_release,
11932 .mmap = io_uring_mmap,
6c5c240e
RP
11933#ifndef CONFIG_MMU
11934 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
11935 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
11936#endif
2b188cc1 11937 .poll = io_uring_poll,
bebdb65e 11938#ifdef CONFIG_PROC_FS
87ce955b 11939 .show_fdinfo = io_uring_show_fdinfo,
bebdb65e 11940#endif
2b188cc1
JA
11941};
11942
c072481d
PB
11943static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
11944 struct io_uring_params *p)
2b188cc1 11945{
75b28aff
HV
11946 struct io_rings *rings;
11947 size_t size, sq_array_offset;
2b188cc1 11948
bd740481
JA
11949 /* make sure these are sane, as we already accounted them */
11950 ctx->sq_entries = p->sq_entries;
11951 ctx->cq_entries = p->cq_entries;
11952
baf9cb64 11953 size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
75b28aff
HV
11954 if (size == SIZE_MAX)
11955 return -EOVERFLOW;
11956
11957 rings = io_mem_alloc(size);
11958 if (!rings)
2b188cc1
JA
11959 return -ENOMEM;
11960
75b28aff
HV
11961 ctx->rings = rings;
11962 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
11963 rings->sq_ring_mask = p->sq_entries - 1;
11964 rings->cq_ring_mask = p->cq_entries - 1;
11965 rings->sq_ring_entries = p->sq_entries;
11966 rings->cq_ring_entries = p->cq_entries;
2b188cc1 11967
ebdeb7c0
JA
11968 if (p->flags & IORING_SETUP_SQE128)
11969 size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
11970 else
11971 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
eb065d30
JA
11972 if (size == SIZE_MAX) {
11973 io_mem_free(ctx->rings);
11974 ctx->rings = NULL;
2b188cc1 11975 return -EOVERFLOW;
eb065d30 11976 }
2b188cc1
JA
11977
11978 ctx->sq_sqes = io_mem_alloc(size);
eb065d30
JA
11979 if (!ctx->sq_sqes) {
11980 io_mem_free(ctx->rings);
11981 ctx->rings = NULL;
2b188cc1 11982 return -ENOMEM;
eb065d30 11983 }
2b188cc1 11984
2b188cc1
JA
11985 return 0;
11986}
11987
9faadcc8
PB
11988static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
11989{
11990 int ret, fd;
11991
11992 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
11993 if (fd < 0)
11994 return fd;
11995
eef51daa 11996 ret = io_uring_add_tctx_node(ctx);
9faadcc8
PB
11997 if (ret) {
11998 put_unused_fd(fd);
11999 return ret;
12000 }
12001 fd_install(fd, file);
12002 return fd;
12003}
12004
2b188cc1
JA
12005/*
12006 * Allocate an anonymous fd, this is what constitutes the application
12007 * visible backing of an io_uring instance. The application mmaps this
12008 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
12009 * we have to tie this fd to a socket for file garbage collection purposes.
12010 */
9faadcc8 12011static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
2b188cc1
JA
12012{
12013 struct file *file;
9faadcc8 12014#if defined(CONFIG_UNIX)
2b188cc1
JA
12015 int ret;
12016
2b188cc1
JA
12017 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
12018 &ctx->ring_sock);
12019 if (ret)
9faadcc8 12020 return ERR_PTR(ret);
2b188cc1
JA
12021#endif
12022
91a9ab7c
PM
12023 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
12024 O_RDWR | O_CLOEXEC, NULL);
2b188cc1 12025#if defined(CONFIG_UNIX)
9faadcc8
PB
12026 if (IS_ERR(file)) {
12027 sock_release(ctx->ring_sock);
12028 ctx->ring_sock = NULL;
12029 } else {
12030 ctx->ring_sock->file = file;
0f212204 12031 }
2b188cc1 12032#endif
9faadcc8 12033 return file;
2b188cc1
JA
12034}
12035
c072481d
PB
12036static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
12037 struct io_uring_params __user *params)
2b188cc1 12038{
2b188cc1 12039 struct io_ring_ctx *ctx;
9faadcc8 12040 struct file *file;
2b188cc1
JA
12041 int ret;
12042
8110c1a6 12043 if (!entries)
2b188cc1 12044 return -EINVAL;
8110c1a6
JA
12045 if (entries > IORING_MAX_ENTRIES) {
12046 if (!(p->flags & IORING_SETUP_CLAMP))
12047 return -EINVAL;
12048 entries = IORING_MAX_ENTRIES;
12049 }
2b188cc1
JA
12050
12051 /*
12052 * Use twice as many entries for the CQ ring. It's possible for the
12053 * application to drive a higher depth than the size of the SQ ring,
12054 * since the sqes are only used at submission time. This allows for
33a107f0
JA
12055 * some flexibility in overcommitting a bit. If the application has
12056 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
12057 * of CQ ring entries manually.
2b188cc1
JA
12058 */
12059 p->sq_entries = roundup_pow_of_two(entries);
33a107f0
JA
12060 if (p->flags & IORING_SETUP_CQSIZE) {
12061 /*
12062 * If IORING_SETUP_CQSIZE is set, we do the same roundup
12063 * to a power-of-two, if it isn't already. We do NOT impose
12064 * any cq vs sq ring sizing.
12065 */
eb2667b3 12066 if (!p->cq_entries)
33a107f0 12067 return -EINVAL;
8110c1a6
JA
12068 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
12069 if (!(p->flags & IORING_SETUP_CLAMP))
12070 return -EINVAL;
12071 p->cq_entries = IORING_MAX_CQ_ENTRIES;
12072 }
eb2667b3
JQ
12073 p->cq_entries = roundup_pow_of_two(p->cq_entries);
12074 if (p->cq_entries < p->sq_entries)
12075 return -EINVAL;
33a107f0
JA
12076 } else {
12077 p->cq_entries = 2 * p->sq_entries;
12078 }
2b188cc1 12079
2b188cc1 12080 ctx = io_ring_ctx_alloc(p);
62e398be 12081 if (!ctx)
2b188cc1 12082 return -ENOMEM;
773697b6
PB
12083
12084 /*
12085 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
12086 * space applications don't need to do io completion events
12087 * polling again, they can rely on io_sq_thread to do polling
12088 * work, which can reduce cpu usage and uring_lock contention.
12089 */
12090 if (ctx->flags & IORING_SETUP_IOPOLL &&
12091 !(ctx->flags & IORING_SETUP_SQPOLL))
12092 ctx->syscall_iopoll = 1;
12093
2b188cc1 12094 ctx->compat = in_compat_syscall();
62e398be
JA
12095 if (!capable(CAP_IPC_LOCK))
12096 ctx->user = get_uid(current_user());
2aede0e4 12097
9f010507 12098 /*
e1169f06
JA
12099 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
12100 * COOP_TASKRUN is set, then IPIs are never needed by the app.
9f010507 12101 */
e1169f06
JA
12102 ret = -EINVAL;
12103 if (ctx->flags & IORING_SETUP_SQPOLL) {
12104 /* IPI related flags don't make sense with SQPOLL */
ef060ea9
JA
12105 if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
12106 IORING_SETUP_TASKRUN_FLAG))
e1169f06 12107 goto err;
9f010507 12108 ctx->notify_method = TWA_SIGNAL_NO_IPI;
e1169f06
JA
12109 } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
12110 ctx->notify_method = TWA_SIGNAL_NO_IPI;
12111 } else {
ef060ea9
JA
12112 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
12113 goto err;
9f010507 12114 ctx->notify_method = TWA_SIGNAL;
e1169f06 12115 }
9f010507 12116
2aede0e4
JA
12117 /*
12118 * This is just grabbed for accounting purposes. When a process exits,
12119 * the mm is exited and dropped before the files, hence we need to hang
12120 * on to this mm purely for the purposes of being able to unaccount
12121 * memory (locked/pinned vm). It's not used for anything else.
12122 */
6b7898eb 12123 mmgrab(current->mm);
2aede0e4 12124 ctx->mm_account = current->mm;
6b7898eb 12125
2b188cc1
JA
12126 ret = io_allocate_scq_urings(ctx, p);
12127 if (ret)
12128 goto err;
12129
7e84e1c7 12130 ret = io_sq_offload_create(ctx, p);
2b188cc1
JA
12131 if (ret)
12132 goto err;
eae071c9 12133 /* always set a rsrc node */
47b228ce
PB
12134 ret = io_rsrc_node_switch_start(ctx);
12135 if (ret)
12136 goto err;
eae071c9 12137 io_rsrc_node_switch(ctx, NULL);
2b188cc1 12138
2b188cc1 12139 memset(&p->sq_off, 0, sizeof(p->sq_off));
75b28aff
HV
12140 p->sq_off.head = offsetof(struct io_rings, sq.head);
12141 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
12142 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
12143 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
12144 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
12145 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
12146 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
2b188cc1
JA
12147
12148 memset(&p->cq_off, 0, sizeof(p->cq_off));
75b28aff
HV
12149 p->cq_off.head = offsetof(struct io_rings, cq.head);
12150 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
12151 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
12152 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
12153 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
12154 p->cq_off.cqes = offsetof(struct io_rings, cqes);
0d9b5b3a 12155 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
ac90f249 12156
7f13657d
XW
12157 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
12158 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
5769a351 12159 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
c73ebb68 12160 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9690557e 12161 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
c4212f3e
JA
12162 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
12163 IORING_FEAT_LINKED_FILE;
7f13657d
XW
12164
12165 if (copy_to_user(params, p, sizeof(*p))) {
12166 ret = -EFAULT;
12167 goto err;
12168 }
d1719f70 12169
9faadcc8
PB
12170 file = io_uring_get_file(ctx);
12171 if (IS_ERR(file)) {
12172 ret = PTR_ERR(file);
12173 goto err;
12174 }
12175
044c1ab3
JA
12176 /*
12177 * Install ring fd as the very last thing, so we don't risk someone
12178 * having closed it before we finish setup
12179 */
9faadcc8
PB
12180 ret = io_uring_install_fd(ctx, file);
12181 if (ret < 0) {
12182 /* fput will clean it up */
12183 fput(file);
12184 return ret;
12185 }
044c1ab3 12186
c826bd7a 12187 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
2b188cc1
JA
12188 return ret;
12189err:
12190 io_ring_ctx_wait_and_kill(ctx);
12191 return ret;
12192}
12193
12194/*
12195 * Sets up an aio uring context, and returns the fd. Applications asks for a
12196 * ring size, we return the actual sq/cq ring sizes (among other things) in the
12197 * params structure passed in.
12198 */
12199static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
12200{
12201 struct io_uring_params p;
2b188cc1
JA
12202 int i;
12203
12204 if (copy_from_user(&p, params, sizeof(p)))
12205 return -EFAULT;
12206 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
12207 if (p.resv[i])
12208 return -EINVAL;
12209 }
12210
6c271ce2 12211 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8110c1a6 12212 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7e84e1c7 12213 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
e1169f06 12214 IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
ebdeb7c0 12215 IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
76c68fbf 12216 IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
2b188cc1
JA
12217 return -EINVAL;
12218
ef060ea9 12219 return io_uring_create(entries, &p, params);
2b188cc1
JA
12220}
12221
12222SYSCALL_DEFINE2(io_uring_setup, u32, entries,
12223 struct io_uring_params __user *, params)
12224{
12225 return io_uring_setup(entries, params);
12226}
12227
c072481d
PB
12228static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
12229 unsigned nr_args)
66f4af93
JA
12230{
12231 struct io_uring_probe *p;
12232 size_t size;
12233 int i, ret;
12234
12235 size = struct_size(p, ops, nr_args);
12236 if (size == SIZE_MAX)
12237 return -EOVERFLOW;
12238 p = kzalloc(size, GFP_KERNEL);
12239 if (!p)
12240 return -ENOMEM;
12241
12242 ret = -EFAULT;
12243 if (copy_from_user(p, arg, size))
12244 goto out;
12245 ret = -EINVAL;
12246 if (memchr_inv(p, 0, size))
12247 goto out;
12248
12249 p->last_op = IORING_OP_LAST - 1;
12250 if (nr_args > IORING_OP_LAST)
12251 nr_args = IORING_OP_LAST;
12252
12253 for (i = 0; i < nr_args; i++) {
12254 p->ops[i].op = i;
12255 if (!io_op_defs[i].not_supported)
12256 p->ops[i].flags = IO_URING_OP_SUPPORTED;
12257 }
12258 p->ops_len = i;
12259
12260 ret = 0;
12261 if (copy_to_user(arg, p, size))
12262 ret = -EFAULT;
12263out:
12264 kfree(p);
12265 return ret;
12266}
12267
071698e1
JA
12268static int io_register_personality(struct io_ring_ctx *ctx)
12269{
4379bf8b 12270 const struct cred *creds;
61cf9370 12271 u32 id;
1e6fa521 12272 int ret;
071698e1 12273
4379bf8b 12274 creds = get_current_cred();
1e6fa521 12275
61cf9370
MWO
12276 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
12277 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
a30f895a
JA
12278 if (ret < 0) {
12279 put_cred(creds);
12280 return ret;
12281 }
12282 return id;
071698e1
JA
12283}
12284
c072481d
PB
12285static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
12286 void __user *arg, unsigned int nr_args)
21b55dbc
SG
12287{
12288 struct io_uring_restriction *res;
12289 size_t size;
12290 int i, ret;
12291
7e84e1c7
SG
12292 /* Restrictions allowed only if rings started disabled */
12293 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
12294 return -EBADFD;
12295
21b55dbc 12296 /* We allow only a single restrictions registration */
7e84e1c7 12297 if (ctx->restrictions.registered)
21b55dbc
SG
12298 return -EBUSY;
12299
12300 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
12301 return -EINVAL;
12302
12303 size = array_size(nr_args, sizeof(*res));
12304 if (size == SIZE_MAX)
12305 return -EOVERFLOW;
12306
12307 res = memdup_user(arg, size);
12308 if (IS_ERR(res))
12309 return PTR_ERR(res);
12310
12311 ret = 0;
12312
12313 for (i = 0; i < nr_args; i++) {
12314 switch (res[i].opcode) {
12315 case IORING_RESTRICTION_REGISTER_OP:
12316 if (res[i].register_op >= IORING_REGISTER_LAST) {
12317 ret = -EINVAL;
12318 goto out;
12319 }
12320
12321 __set_bit(res[i].register_op,
12322 ctx->restrictions.register_op);
12323 break;
12324 case IORING_RESTRICTION_SQE_OP:
12325 if (res[i].sqe_op >= IORING_OP_LAST) {
12326 ret = -EINVAL;
12327 goto out;
12328 }
12329
12330 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
12331 break;
12332 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
12333 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
12334 break;
12335 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
12336 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
12337 break;
12338 default:
12339 ret = -EINVAL;
12340 goto out;
12341 }
12342 }
12343
12344out:
12345 /* Reset all restrictions if an error happened */
12346 if (ret != 0)
12347 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
12348 else
7e84e1c7 12349 ctx->restrictions.registered = true;
21b55dbc
SG
12350
12351 kfree(res);
12352 return ret;
12353}
12354
7e84e1c7
SG
12355static int io_register_enable_rings(struct io_ring_ctx *ctx)
12356{
12357 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
12358 return -EBADFD;
12359
12360 if (ctx->restrictions.registered)
12361 ctx->restricted = 1;
12362
0298ef96
PB
12363 ctx->flags &= ~IORING_SETUP_R_DISABLED;
12364 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
12365 wake_up(&ctx->sq_data->wait);
7e84e1c7
SG
12366 return 0;
12367}
12368
fdecb662 12369static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
c3bdad02 12370 struct io_uring_rsrc_update2 *up,
98f0b3b4
PB
12371 unsigned nr_args)
12372{
12373 __u32 tmp;
12374 int err;
12375
12376 if (check_add_overflow(up->offset, nr_args, &tmp))
12377 return -EOVERFLOW;
12378 err = io_rsrc_node_switch_start(ctx);
12379 if (err)
12380 return err;
12381
fdecb662
PB
12382 switch (type) {
12383 case IORING_RSRC_FILE:
98f0b3b4 12384 return __io_sqe_files_update(ctx, up, nr_args);
634d00df
PB
12385 case IORING_RSRC_BUFFER:
12386 return __io_sqe_buffers_update(ctx, up, nr_args);
98f0b3b4
PB
12387 }
12388 return -EINVAL;
12389}
12390
c3bdad02
PB
12391static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
12392 unsigned nr_args)
98f0b3b4 12393{
c3bdad02 12394 struct io_uring_rsrc_update2 up;
98f0b3b4
PB
12395
12396 if (!nr_args)
12397 return -EINVAL;
c3bdad02
PB
12398 memset(&up, 0, sizeof(up));
12399 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
12400 return -EFAULT;
d8a3ba9c 12401 if (up.resv || up.resv2)
565c5e61 12402 return -EINVAL;
c3bdad02
PB
12403 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
12404}
12405
12406static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
992da01a 12407 unsigned size, unsigned type)
c3bdad02
PB
12408{
12409 struct io_uring_rsrc_update2 up;
12410
12411 if (size != sizeof(up))
12412 return -EINVAL;
98f0b3b4
PB
12413 if (copy_from_user(&up, arg, sizeof(up)))
12414 return -EFAULT;
d8a3ba9c 12415 if (!up.nr || up.resv || up.resv2)
98f0b3b4 12416 return -EINVAL;
992da01a 12417 return __io_register_rsrc_update(ctx, type, &up, up.nr);
98f0b3b4
PB
12418}
12419
c072481d 12420static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
992da01a 12421 unsigned int size, unsigned int type)
792e3582
PB
12422{
12423 struct io_uring_rsrc_register rr;
12424
12425 /* keep it extendible */
12426 if (size != sizeof(rr))
12427 return -EINVAL;
12428
12429 memset(&rr, 0, sizeof(rr));
12430 if (copy_from_user(&rr, arg, size))
12431 return -EFAULT;
992da01a 12432 if (!rr.nr || rr.resv || rr.resv2)
792e3582
PB
12433 return -EINVAL;
12434
992da01a 12435 switch (type) {
792e3582
PB
12436 case IORING_RSRC_FILE:
12437 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
12438 rr.nr, u64_to_user_ptr(rr.tags));
634d00df
PB
12439 case IORING_RSRC_BUFFER:
12440 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
12441 rr.nr, u64_to_user_ptr(rr.tags));
792e3582
PB
12442 }
12443 return -EINVAL;
12444}
12445
c072481d
PB
12446static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
12447 void __user *arg, unsigned len)
fe76421d
JA
12448{
12449 struct io_uring_task *tctx = current->io_uring;
12450 cpumask_var_t new_mask;
12451 int ret;
12452
12453 if (!tctx || !tctx->io_wq)
12454 return -EINVAL;
12455
12456 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
12457 return -ENOMEM;
12458
12459 cpumask_clear(new_mask);
12460 if (len > cpumask_size())
12461 len = cpumask_size();
12462
0f5e4b83
ES
12463 if (in_compat_syscall()) {
12464 ret = compat_get_bitmap(cpumask_bits(new_mask),
12465 (const compat_ulong_t __user *)arg,
12466 len * 8 /* CHAR_BIT */);
12467 } else {
12468 ret = copy_from_user(new_mask, arg, len);
12469 }
12470
12471 if (ret) {
fe76421d
JA
12472 free_cpumask_var(new_mask);
12473 return -EFAULT;
12474 }
12475
12476 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
12477 free_cpumask_var(new_mask);
12478 return ret;
12479}
12480
c072481d 12481static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
fe76421d
JA
12482{
12483 struct io_uring_task *tctx = current->io_uring;
12484
12485 if (!tctx || !tctx->io_wq)
12486 return -EINVAL;
12487
12488 return io_wq_cpu_affinity(tctx->io_wq, NULL);
12489}
12490
c072481d
PB
12491static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
12492 void __user *arg)
b22fa62a 12493 __must_hold(&ctx->uring_lock)
2e480058 12494{
b22fa62a 12495 struct io_tctx_node *node;
fa84693b
JA
12496 struct io_uring_task *tctx = NULL;
12497 struct io_sq_data *sqd = NULL;
2e480058
JA
12498 __u32 new_count[2];
12499 int i, ret;
12500
2e480058
JA
12501 if (copy_from_user(new_count, arg, sizeof(new_count)))
12502 return -EFAULT;
12503 for (i = 0; i < ARRAY_SIZE(new_count); i++)
12504 if (new_count[i] > INT_MAX)
12505 return -EINVAL;
12506
fa84693b
JA
12507 if (ctx->flags & IORING_SETUP_SQPOLL) {
12508 sqd = ctx->sq_data;
12509 if (sqd) {
009ad9f0
JA
12510 /*
12511 * Observe the correct sqd->lock -> ctx->uring_lock
12512 * ordering. Fine to drop uring_lock here, we hold
12513 * a ref to the ctx.
12514 */
41d3a6bd 12515 refcount_inc(&sqd->refs);
009ad9f0 12516 mutex_unlock(&ctx->uring_lock);
fa84693b 12517 mutex_lock(&sqd->lock);
009ad9f0 12518 mutex_lock(&ctx->uring_lock);
41d3a6bd
JA
12519 if (sqd->thread)
12520 tctx = sqd->thread->io_uring;
fa84693b
JA
12521 }
12522 } else {
12523 tctx = current->io_uring;
12524 }
12525
e139a1ec 12526 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
fa84693b 12527
bad119b9
PB
12528 for (i = 0; i < ARRAY_SIZE(new_count); i++)
12529 if (new_count[i])
12530 ctx->iowq_limits[i] = new_count[i];
e139a1ec
PB
12531 ctx->iowq_limits_set = true;
12532
e139a1ec
PB
12533 if (tctx && tctx->io_wq) {
12534 ret = io_wq_max_workers(tctx->io_wq, new_count);
12535 if (ret)
12536 goto err;
12537 } else {
12538 memset(new_count, 0, sizeof(new_count));
12539 }
fa84693b 12540
41d3a6bd 12541 if (sqd) {
fa84693b 12542 mutex_unlock(&sqd->lock);
41d3a6bd
JA
12543 io_put_sq_data(sqd);
12544 }
2e480058
JA
12545
12546 if (copy_to_user(arg, new_count, sizeof(new_count)))
12547 return -EFAULT;
12548
b22fa62a
PB
12549 /* that's it for SQPOLL, only the SQPOLL task creates requests */
12550 if (sqd)
12551 return 0;
12552
12553 /* now propagate the restriction to all registered users */
12554 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
12555 struct io_uring_task *tctx = node->task->io_uring;
12556
12557 if (WARN_ON_ONCE(!tctx->io_wq))
12558 continue;
12559
12560 for (i = 0; i < ARRAY_SIZE(new_count); i++)
12561 new_count[i] = ctx->iowq_limits[i];
12562 /* ignore errors, it always returns zero anyway */
12563 (void)io_wq_max_workers(tctx->io_wq, new_count);
12564 }
2e480058 12565 return 0;
fa84693b 12566err:
41d3a6bd 12567 if (sqd) {
fa84693b 12568 mutex_unlock(&sqd->lock);
41d3a6bd
JA
12569 io_put_sq_data(sqd);
12570 }
fa84693b 12571 return ret;
2e480058
JA
12572}
12573
edafccee
JA
12574static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
12575 void __user *arg, unsigned nr_args)
b19062a5
JA
12576 __releases(ctx->uring_lock)
12577 __acquires(ctx->uring_lock)
edafccee
JA
12578{
12579 int ret;
12580
35fa71a0
JA
12581 /*
12582 * We're inside the ring mutex, if the ref is already dying, then
12583 * someone else killed the ctx or is already going through
12584 * io_uring_register().
12585 */
12586 if (percpu_ref_is_dying(&ctx->refs))
12587 return -ENXIO;
12588
75c4021a
PB
12589 if (ctx->restricted) {
12590 if (opcode >= IORING_REGISTER_LAST)
12591 return -EINVAL;
12592 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
12593 if (!test_bit(opcode, ctx->restrictions.register_op))
12594 return -EACCES;
12595 }
12596
edafccee
JA
12597 switch (opcode) {
12598 case IORING_REGISTER_BUFFERS:
634d00df 12599 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
edafccee
JA
12600 break;
12601 case IORING_UNREGISTER_BUFFERS:
12602 ret = -EINVAL;
12603 if (arg || nr_args)
12604 break;
0a96bbe4 12605 ret = io_sqe_buffers_unregister(ctx);
edafccee 12606 break;
6b06314c 12607 case IORING_REGISTER_FILES:
792e3582 12608 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
6b06314c
JA
12609 break;
12610 case IORING_UNREGISTER_FILES:
12611 ret = -EINVAL;
12612 if (arg || nr_args)
12613 break;
12614 ret = io_sqe_files_unregister(ctx);
12615 break;
c3a31e60 12616 case IORING_REGISTER_FILES_UPDATE:
c3bdad02 12617 ret = io_register_files_update(ctx, arg, nr_args);
c3a31e60 12618 break;
9b402849
JA
12619 case IORING_REGISTER_EVENTFD:
12620 ret = -EINVAL;
12621 if (nr_args != 1)
12622 break;
c75312dd
UA
12623 ret = io_eventfd_register(ctx, arg, 0);
12624 break;
12625 case IORING_REGISTER_EVENTFD_ASYNC:
12626 ret = -EINVAL;
12627 if (nr_args != 1)
f2842ab5 12628 break;
c75312dd 12629 ret = io_eventfd_register(ctx, arg, 1);
9b402849
JA
12630 break;
12631 case IORING_UNREGISTER_EVENTFD:
12632 ret = -EINVAL;
12633 if (arg || nr_args)
12634 break;
12635 ret = io_eventfd_unregister(ctx);
12636 break;
66f4af93
JA
12637 case IORING_REGISTER_PROBE:
12638 ret = -EINVAL;
12639 if (!arg || nr_args > 256)
12640 break;
12641 ret = io_probe(ctx, arg, nr_args);
12642 break;
071698e1
JA
12643 case IORING_REGISTER_PERSONALITY:
12644 ret = -EINVAL;
12645 if (arg || nr_args)
12646 break;
12647 ret = io_register_personality(ctx);
12648 break;
12649 case IORING_UNREGISTER_PERSONALITY:
12650 ret = -EINVAL;
12651 if (arg)
12652 break;
12653 ret = io_unregister_personality(ctx, nr_args);
12654 break;
7e84e1c7
SG
12655 case IORING_REGISTER_ENABLE_RINGS:
12656 ret = -EINVAL;
12657 if (arg || nr_args)
12658 break;
12659 ret = io_register_enable_rings(ctx);
12660 break;
21b55dbc
SG
12661 case IORING_REGISTER_RESTRICTIONS:
12662 ret = io_register_restrictions(ctx, arg, nr_args);
12663 break;
992da01a
PB
12664 case IORING_REGISTER_FILES2:
12665 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
12666 break;
12667 case IORING_REGISTER_FILES_UPDATE2:
12668 ret = io_register_rsrc_update(ctx, arg, nr_args,
12669 IORING_RSRC_FILE);
12670 break;
12671 case IORING_REGISTER_BUFFERS2:
12672 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
792e3582 12673 break;
992da01a
PB
12674 case IORING_REGISTER_BUFFERS_UPDATE:
12675 ret = io_register_rsrc_update(ctx, arg, nr_args,
12676 IORING_RSRC_BUFFER);
c3bdad02 12677 break;
fe76421d
JA
12678 case IORING_REGISTER_IOWQ_AFF:
12679 ret = -EINVAL;
12680 if (!arg || !nr_args)
12681 break;
12682 ret = io_register_iowq_aff(ctx, arg, nr_args);
12683 break;
12684 case IORING_UNREGISTER_IOWQ_AFF:
12685 ret = -EINVAL;
12686 if (arg || nr_args)
12687 break;
12688 ret = io_unregister_iowq_aff(ctx);
12689 break;
2e480058
JA
12690 case IORING_REGISTER_IOWQ_MAX_WORKERS:
12691 ret = -EINVAL;
12692 if (!arg || nr_args != 2)
12693 break;
12694 ret = io_register_iowq_max_workers(ctx, arg);
12695 break;
e7a6c00d
JA
12696 case IORING_REGISTER_RING_FDS:
12697 ret = io_ringfd_register(ctx, arg, nr_args);
12698 break;
12699 case IORING_UNREGISTER_RING_FDS:
12700 ret = io_ringfd_unregister(ctx, arg, nr_args);
12701 break;
edafccee
JA
12702 default:
12703 ret = -EINVAL;
12704 break;
12705 }
12706
edafccee
JA
12707 return ret;
12708}
12709
12710SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
12711 void __user *, arg, unsigned int, nr_args)
12712{
12713 struct io_ring_ctx *ctx;
12714 long ret = -EBADF;
12715 struct fd f;
12716
12717 f = fdget(fd);
12718 if (!f.file)
12719 return -EBADF;
12720
12721 ret = -EOPNOTSUPP;
12722 if (f.file->f_op != &io_uring_fops)
12723 goto out_fput;
12724
12725 ctx = f.file->private_data;
12726
b6c23dd5
PB
12727 io_run_task_work();
12728
edafccee
JA
12729 mutex_lock(&ctx->uring_lock);
12730 ret = __io_uring_register(ctx, opcode, arg, nr_args);
12731 mutex_unlock(&ctx->uring_lock);
2757be22 12732 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
edafccee
JA
12733out_fput:
12734 fdput(f);
12735 return ret;
12736}
12737
2b188cc1
JA
12738static int __init io_uring_init(void)
12739{
d7f62e82
SM
12740#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
12741 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
12742 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
12743} while (0)
12744
12745#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
12746 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
12747 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
12748 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
12749 BUILD_BUG_SQE_ELEM(1, __u8, flags);
12750 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
12751 BUILD_BUG_SQE_ELEM(4, __s32, fd);
12752 BUILD_BUG_SQE_ELEM(8, __u64, off);
12753 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
12754 BUILD_BUG_SQE_ELEM(16, __u64, addr);
7d67af2c 12755 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
d7f62e82
SM
12756 BUILD_BUG_SQE_ELEM(24, __u32, len);
12757 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
12758 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
12759 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
12760 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
5769a351
JX
12761 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
12762 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
d7f62e82
SM
12763 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
12764 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
12765 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
12766 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
12767 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
12768 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
12769 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
12770 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
7d67af2c 12771 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
d7f62e82
SM
12772 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
12773 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
16340eab 12774 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
d7f62e82 12775 BUILD_BUG_SQE_ELEM(42, __u16, personality);
7d67af2c 12776 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
b9445598 12777 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
e9621e2b 12778 BUILD_BUG_SQE_ELEM(48, __u64, addr3);
d7f62e82 12779
b0d658ec
PB
12780 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
12781 sizeof(struct io_uring_rsrc_update));
12782 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
12783 sizeof(struct io_uring_rsrc_update2));
90499ad0
PB
12784
12785 /* ->buf_index is u16 */
12786 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
9cfc7e94 12787 BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
90499ad0 12788
b0d658ec
PB
12789 /* should fit into one byte */
12790 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
68fe256a
PB
12791 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
12792 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
b0d658ec 12793
d3656344 12794 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
32c2d33e 12795 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
16340eab 12796
3a4b89a2
JA
12797 BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
12798
ee692a21
JA
12799 BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
12800
91f245d5
JA
12801 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
12802 SLAB_ACCOUNT);
2b188cc1
JA
12803 return 0;
12804};
12805__initcall(io_uring_init);