fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/mmu_context.h>
  59 #include <linux/percpu.h>
  60 #include <linux/slab.h>
  61 #include <linux/kthread.h>
  62 #include <linux/blkdev.h>
  63 #include <linux/bvec.h>
  64 #include <linux/net.h>
  65 #include <net/sock.h>
  66 #include <net/af_unix.h>
  67 #include <net/scm.h>
  68 #include <linux/anon_inodes.h>
  69 #include <linux/sched/mm.h>
  70 #include <linux/uaccess.h>
  71 #include <linux/nospec.h>
  72 #include <linux/sizes.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/highmem.h>
  75 #include <linux/namei.h>
  76 #include <linux/fsnotify.h>
  77 #include <linux/fadvise.h>
  78 #include <linux/eventpoll.h>
  79 #include <linux/fs_struct.h>
  80 #include <linux/splice.h>
  81 #include <linux/task_work.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
  91 #define IORING_MAX_ENTRIES      32768
  92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  93
  94 /*
  95  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  96  */
  97 #define IORING_FILE_TABLE_SHIFT 9
  98 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  99 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
 100 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
 101
 102 struct io_uring {
 103         u32 head ____cacheline_aligned_in_smp;
 104         u32 tail ____cacheline_aligned_in_smp;
 105 };
 106
 107 /*
 108  * This data is shared with the application through the mmap at offsets
 109  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 110  *
 111  * The offsets to the member fields are published through struct
 112  * io_sqring_offsets when calling io_uring_setup.
 113  */
 114 struct io_rings {
 115         /*
 116          * Head and tail offsets into the ring; the offsets need to be
 117          * masked to get valid indices.
 118          *
 119          * The kernel controls head of the sq ring and the tail of the cq ring,
 120          * and the application controls tail of the sq ring and the head of the
 121          * cq ring.
 122          */
 123         struct io_uring         sq, cq;
 124         /*
 125          * Bitmasks to apply to head and tail offsets (constant, equals
 126          * ring_entries - 1)
 127          */
 128         u32                     sq_ring_mask, cq_ring_mask;
 129         /* Ring sizes (constant, power of 2) */
 130         u32                     sq_ring_entries, cq_ring_entries;
 131         /*
 132          * Number of invalid entries dropped by the kernel due to
 133          * invalid index stored in array
 134          *
 135          * Written by the kernel, shouldn't be modified by the
 136          * application (i.e. get number of "new events" by comparing to
 137          * cached value).
 138          *
 139          * After a new SQ head value was read by the application this
 140          * counter includes all submissions that were dropped reaching
 141          * the new SQ head (and possibly more).
 142          */
 143         u32                     sq_dropped;
 144         /*
 145          * Runtime flags
 146          *
 147          * Written by the kernel, shouldn't be modified by the
 148          * application.
 149          *
 150          * The application needs a full memory barrier before checking
 151          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 152          */
 153         u32                     sq_flags;
 154         /*
 155          * Number of completion events lost because the queue was full;
 156          * this should be avoided by the application by making sure
 157          * there are not more requests pending than there is space in
 158          * the completion queue.
 159          *
 160          * Written by the kernel, shouldn't be modified by the
 161          * application (i.e. get number of "new events" by comparing to
 162          * cached value).
 163          *
 164          * As completion events come in out of order this counter is not
 165          * ordered with any other data.
 166          */
 167         u32                     cq_overflow;
 168         /*
 169          * Ring buffer of completion events.
 170          *
 171          * The kernel writes completion events fresh every time they are
 172          * produced, so the application is allowed to modify pending
 173          * entries.
 174          */
 175         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 176 };
 177
 178 struct io_mapped_ubuf {
 179         u64             ubuf;
 180         size_t          len;
 181         struct          bio_vec *bvec;
 182         unsigned int    nr_bvecs;
 183 };
 184
 185 struct fixed_file_table {
 186         struct file             **files;
 187 };
 188
 189 struct fixed_file_ref_node {
 190         struct percpu_ref               refs;
 191         struct list_head                node;
 192         struct list_head                file_list;
 193         struct fixed_file_data          *file_data;
 194         struct work_struct              work;
 195 };
 196
 197 struct fixed_file_data {
 198         struct fixed_file_table         *table;
 199         struct io_ring_ctx              *ctx;
 200
 201         struct percpu_ref               *cur_refs;
 202         struct percpu_ref               refs;
 203         struct completion               done;
 204         struct list_head                ref_list;
 205         spinlock_t                      lock;
 206 };
 207
 208 struct io_buffer {
 209         struct list_head list;
 210         __u64 addr;
 211         __s32 len;
 212         __u16 bid;
 213 };
 214
 215 struct io_ring_ctx {
 216         struct {
 217                 struct percpu_ref       refs;
 218         } ____cacheline_aligned_in_smp;
 219
 220         struct {
 221                 unsigned int            flags;
 222                 unsigned int            compat: 1;
 223                 unsigned int            account_mem: 1;
 224                 unsigned int            cq_overflow_flushed: 1;
 225                 unsigned int            drain_next: 1;
 226                 unsigned int            eventfd_async: 1;
 227
 228                 /*
 229                  * Ring buffer of indices into array of io_uring_sqe, which is
 230                  * mmapped by the application using the IORING_OFF_SQES offset.
 231                  *
 232                  * This indirection could e.g. be used to assign fixed
 233                  * io_uring_sqe entries to operations and only submit them to
 234                  * the queue when needed.
 235                  *
 236                  * The kernel modifies neither the indices array nor the entries
 237                  * array.
 238                  */
 239                 u32                     *sq_array;
 240                 unsigned                cached_sq_head;
 241                 unsigned                sq_entries;
 242                 unsigned                sq_mask;
 243                 unsigned                sq_thread_idle;
 244                 unsigned                cached_sq_dropped;
 245                 atomic_t                cached_cq_overflow;
 246                 unsigned long           sq_check_overflow;
 247
 248                 struct list_head        defer_list;
 249                 struct list_head        timeout_list;
 250                 struct list_head        cq_overflow_list;
 251
 252                 wait_queue_head_t       inflight_wait;
 253                 struct io_uring_sqe     *sq_sqes;
 254         } ____cacheline_aligned_in_smp;
 255
 256         struct io_rings *rings;
 257
 258         /* IO offload */
 259         struct io_wq            *io_wq;
 260         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 261         struct mm_struct        *sqo_mm;
 262         wait_queue_head_t       sqo_wait;
 263
 264         /*
 265          * If used, fixed file set. Writers must ensure that ->refs is dead,
 266          * readers must ensure that ->refs is alive as long as the file* is
 267          * used. Only updated through io_uring_register(2).
 268          */
 269         struct fixed_file_data  *file_data;
 270         unsigned                nr_user_files;
 271         int                     ring_fd;
 272         struct file             *ring_file;
 273
 274         /* if used, fixed mapped user buffers */
 275         unsigned                nr_user_bufs;
 276         struct io_mapped_ubuf   *user_bufs;
 277
 278         struct user_struct      *user;
 279
 280         const struct cred       *creds;
 281
 282         /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 283         struct completion       *completions;
 284
 285         /* if all else fails... */
 286         struct io_kiocb         *fallback_req;
 287
 288 #if defined(CONFIG_UNIX)
 289         struct socket           *ring_sock;
 290 #endif
 291
 292         struct idr              io_buffer_idr;
 293
 294         struct idr              personality_idr;
 295
 296         struct {
 297                 unsigned                cached_cq_tail;
 298                 unsigned                cq_entries;
 299                 unsigned                cq_mask;
 300                 atomic_t                cq_timeouts;
 301                 unsigned long           cq_check_overflow;
 302                 struct wait_queue_head  cq_wait;
 303                 struct fasync_struct    *cq_fasync;
 304                 struct eventfd_ctx      *cq_ev_fd;
 305         } ____cacheline_aligned_in_smp;
 306
 307         struct {
 308                 struct mutex            uring_lock;
 309                 wait_queue_head_t       wait;
 310         } ____cacheline_aligned_in_smp;
 311
 312         struct {
 313                 spinlock_t              completion_lock;
 314
 315                 /*
 316                  * ->poll_list is protected by the ctx->uring_lock for
 317                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 318                  * For SQPOLL, only the single threaded io_sq_thread() will
 319                  * manipulate the list, hence no extra locking is needed there.
 320                  */
 321                 struct list_head        poll_list;
 322                 struct hlist_head       *cancel_hash;
 323                 unsigned                cancel_hash_bits;
 324                 bool                    poll_multi_file;
 325
 326                 spinlock_t              inflight_lock;
 327                 struct list_head        inflight_list;
 328         } ____cacheline_aligned_in_smp;
 329
 330         struct work_struct              exit_work;
 331 };
 332
 333 /*
 334  * First field must be the file pointer in all the
 335  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 336  */
 337 struct io_poll_iocb {
 338         struct file                     *file;
 339         union {
 340                 struct wait_queue_head  *head;
 341                 u64                     addr;
 342         };
 343         __poll_t                        events;
 344         bool                            done;
 345         bool                            canceled;
 346         struct wait_queue_entry         wait;
 347 };
 348
 349 struct io_close {
 350         struct file                     *file;
 351         struct file                     *put_file;
 352         int                             fd;
 353 };
 354
 355 struct io_timeout_data {
 356         struct io_kiocb                 *req;
 357         struct hrtimer                  timer;
 358         struct timespec64               ts;
 359         enum hrtimer_mode               mode;
 360 };
 361
 362 struct io_accept {
 363         struct file                     *file;
 364         struct sockaddr __user          *addr;
 365         int __user                      *addr_len;
 366         int                             flags;
 367         unsigned long                   nofile;
 368 };
 369
 370 struct io_sync {
 371         struct file                     *file;
 372         loff_t                          len;
 373         loff_t                          off;
 374         int                             flags;
 375         int                             mode;
 376 };
 377
 378 struct io_cancel {
 379         struct file                     *file;
 380         u64                             addr;
 381 };
 382
 383 struct io_timeout {
 384         struct file                     *file;
 385         u64                             addr;
 386         int                             flags;
 387         u32                             count;
 388 };
 389
 390 struct io_rw {
 391         /* NOTE: kiocb has the file as the first member, so don't do it here */
 392         struct kiocb                    kiocb;
 393         u64                             addr;
 394         u64                             len;
 395 };
 396
 397 struct io_connect {
 398         struct file                     *file;
 399         struct sockaddr __user          *addr;
 400         int                             addr_len;
 401 };
 402
 403 struct io_sr_msg {
 404         struct file                     *file;
 405         union {
 406                 struct user_msghdr __user *msg;
 407                 void __user             *buf;
 408         };
 409         int                             msg_flags;
 410         int                             bgid;
 411         size_t                          len;
 412         struct io_buffer                *kbuf;
 413 };
 414
 415 struct io_open {
 416         struct file                     *file;
 417         int                             dfd;
 418         union {
 419                 unsigned                mask;
 420         };
 421         struct filename                 *filename;
 422         struct statx __user             *buffer;
 423         struct open_how                 how;
 424         unsigned long                   nofile;
 425 };
 426
 427 struct io_files_update {
 428         struct file                     *file;
 429         u64                             arg;
 430         u32                             nr_args;
 431         u32                             offset;
 432 };
 433
 434 struct io_fadvise {
 435         struct file                     *file;
 436         u64                             offset;
 437         u32                             len;
 438         u32                             advice;
 439 };
 440
 441 struct io_madvise {
 442         struct file                     *file;
 443         u64                             addr;
 444         u32                             len;
 445         u32                             advice;
 446 };
 447
 448 struct io_epoll {
 449         struct file                     *file;
 450         int                             epfd;
 451         int                             op;
 452         int                             fd;
 453         struct epoll_event              event;
 454 };
 455
 456 struct io_splice {
 457         struct file                     *file_out;
 458         struct file                     *file_in;
 459         loff_t                          off_out;
 460         loff_t                          off_in;
 461         u64                             len;
 462         unsigned int                    flags;
 463 };
 464
 465 struct io_provide_buf {
 466         struct file                     *file;
 467         __u64                           addr;
 468         __s32                           len;
 469         __u32                           bgid;
 470         __u16                           nbufs;
 471         __u16                           bid;
 472 };
 473
 474 struct io_async_connect {
 475         struct sockaddr_storage         address;
 476 };
 477
 478 struct io_async_msghdr {
 479         struct iovec                    fast_iov[UIO_FASTIOV];
 480         struct iovec                    *iov;
 481         struct sockaddr __user          *uaddr;
 482         struct msghdr                   msg;
 483         struct sockaddr_storage         addr;
 484 };
 485
 486 struct io_async_rw {
 487         struct iovec                    fast_iov[UIO_FASTIOV];
 488         struct iovec                    *iov;
 489         ssize_t                         nr_segs;
 490         ssize_t                         size;
 491 };
 492
 493 struct io_async_ctx {
 494         union {
 495                 struct io_async_rw      rw;
 496                 struct io_async_msghdr  msg;
 497                 struct io_async_connect connect;
 498                 struct io_timeout_data  timeout;
 499         };
 500 };
 501
 502 enum {
 503         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 504         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 505         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 506         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 507         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 508         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 509
 510         REQ_F_LINK_HEAD_BIT,
 511         REQ_F_LINK_NEXT_BIT,
 512         REQ_F_FAIL_LINK_BIT,
 513         REQ_F_INFLIGHT_BIT,
 514         REQ_F_CUR_POS_BIT,
 515         REQ_F_NOWAIT_BIT,
 516         REQ_F_IOPOLL_COMPLETED_BIT,
 517         REQ_F_LINK_TIMEOUT_BIT,
 518         REQ_F_TIMEOUT_BIT,
 519         REQ_F_ISREG_BIT,
 520         REQ_F_MUST_PUNT_BIT,
 521         REQ_F_TIMEOUT_NOSEQ_BIT,
 522         REQ_F_COMP_LOCKED_BIT,
 523         REQ_F_NEED_CLEANUP_BIT,
 524         REQ_F_OVERFLOW_BIT,
 525         REQ_F_POLLED_BIT,
 526         REQ_F_BUFFER_SELECTED_BIT,
 527         REQ_F_NO_FILE_TABLE_BIT,
 528
 529         /* not a real bit, just to check we're not overflowing the space */
 530         __REQ_F_LAST_BIT,
 531 };
 532
 533 enum {
 534         /* ctx owns file */
 535         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 536         /* drain existing IO first */
 537         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 538         /* linked sqes */
 539         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 540         /* doesn't sever on completion < 0 */
 541         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 542         /* IOSQE_ASYNC */
 543         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 544         /* IOSQE_BUFFER_SELECT */
 545         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 546
 547         /* head of a link */
 548         REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
 549         /* already grabbed next link */
 550         REQ_F_LINK_NEXT         = BIT(REQ_F_LINK_NEXT_BIT),
 551         /* fail rest of links */
 552         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 553         /* on inflight list */
 554         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 555         /* read/write uses file position */
 556         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 557         /* must not punt to workers */
 558         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 559         /* polled IO has completed */
 560         REQ_F_IOPOLL_COMPLETED  = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
 561         /* has linked timeout */
 562         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 563         /* timeout request */
 564         REQ_F_TIMEOUT           = BIT(REQ_F_TIMEOUT_BIT),
 565         /* regular file */
 566         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 567         /* must be punted even for NONBLOCK */
 568         REQ_F_MUST_PUNT         = BIT(REQ_F_MUST_PUNT_BIT),
 569         /* no timeout sequence */
 570         REQ_F_TIMEOUT_NOSEQ     = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 571         /* completion under lock */
 572         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
 573         /* needs cleanup */
 574         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 575         /* in overflow list */
 576         REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
 577         /* already went through poll handler */
 578         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 579         /* buffer already selected */
 580         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 581         /* doesn't need file table for this request */
 582         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 583 };
 584
 585 struct async_poll {
 586         struct io_poll_iocb     poll;
 587         struct io_wq_work       work;
 588 };
 589
 590 /*
 591  * NOTE! Each of the iocb union members has the file pointer
 592  * as the first entry in their struct definition. So you can
 593  * access the file pointer through any of the sub-structs,
 594  * or directly as just 'ki_filp' in this struct.
 595  */
 596 struct io_kiocb {
 597         union {
 598                 struct file             *file;
 599                 struct io_rw            rw;
 600                 struct io_poll_iocb     poll;
 601                 struct io_accept        accept;
 602                 struct io_sync          sync;
 603                 struct io_cancel        cancel;
 604                 struct io_timeout       timeout;
 605                 struct io_connect       connect;
 606                 struct io_sr_msg        sr_msg;
 607                 struct io_open          open;
 608                 struct io_close         close;
 609                 struct io_files_update  files_update;
 610                 struct io_fadvise       fadvise;
 611                 struct io_madvise       madvise;
 612                 struct io_epoll         epoll;
 613                 struct io_splice        splice;
 614                 struct io_provide_buf   pbuf;
 615         };
 616
 617         struct io_async_ctx             *io;
 618         int                             cflags;
 619         bool                            needs_fixed_file;
 620         u8                              opcode;
 621
 622         u16                             buf_index;
 623
 624         struct io_ring_ctx      *ctx;
 625         struct list_head        list;
 626         unsigned int            flags;
 627         refcount_t              refs;
 628         struct task_struct      *task;
 629         unsigned long           fsize;
 630         u64                     user_data;
 631         u32                     result;
 632         u32                     sequence;
 633
 634         struct list_head        link_list;
 635
 636         struct list_head        inflight_entry;
 637
 638         struct percpu_ref       *fixed_file_refs;
 639
 640         union {
 641                 /*
 642                  * Only commands that never go async can use the below fields,
 643                  * obviously. Right now only IORING_OP_POLL_ADD uses them, and
 644                  * async armed poll handlers for regular commands. The latter
 645                  * restore the work, if needed.
 646                  */
 647                 struct {
 648                         struct callback_head    task_work;
 649                         struct hlist_node       hash_node;
 650                         struct async_poll       *apoll;
 651                 };
 652                 struct io_wq_work       work;
 653         };
 654 };
 655
 656 #define IO_PLUG_THRESHOLD               2
 657 #define IO_IOPOLL_BATCH                 8
 658
 659 struct io_submit_state {
 660         struct blk_plug         plug;
 661
 662         /*
 663          * io_kiocb alloc cache
 664          */
 665         void                    *reqs[IO_IOPOLL_BATCH];
 666         unsigned int            free_reqs;
 667
 668         /*
 669          * File reference cache
 670          */
 671         struct file             *file;
 672         unsigned int            fd;
 673         unsigned int            has_refs;
 674         unsigned int            used_refs;
 675         unsigned int            ios_left;
 676 };
 677
 678 struct io_op_def {
 679         /* needs req->io allocated for deferral/async */
 680         unsigned                async_ctx : 1;
 681         /* needs current->mm setup, does mm access */
 682         unsigned                needs_mm : 1;
 683         /* needs req->file assigned */
 684         unsigned                needs_file : 1;
 685         /* hash wq insertion if file is a regular file */
 686         unsigned                hash_reg_file : 1;
 687         /* unbound wq insertion if file is a non-regular file */
 688         unsigned                unbound_nonreg_file : 1;
 689         /* opcode is not supported by this kernel */
 690         unsigned                not_supported : 1;
 691         /* needs file table */
 692         unsigned                file_table : 1;
 693         /* needs ->fs */
 694         unsigned                needs_fs : 1;
 695         /* set if opcode supports polled "wait" */
 696         unsigned                pollin : 1;
 697         unsigned                pollout : 1;
 698         /* op supports buffer selection */
 699         unsigned                buffer_select : 1;
 700 };
 701
 702 static const struct io_op_def io_op_defs[] = {
 703         [IORING_OP_NOP] = {},
 704         [IORING_OP_READV] = {
 705                 .async_ctx              = 1,
 706                 .needs_mm               = 1,
 707                 .needs_file             = 1,
 708                 .unbound_nonreg_file    = 1,
 709                 .pollin                 = 1,
 710                 .buffer_select          = 1,
 711         },
 712         [IORING_OP_WRITEV] = {
 713                 .async_ctx              = 1,
 714                 .needs_mm               = 1,
 715                 .needs_file             = 1,
 716                 .hash_reg_file          = 1,
 717                 .unbound_nonreg_file    = 1,
 718                 .pollout                = 1,
 719         },
 720         [IORING_OP_FSYNC] = {
 721                 .needs_file             = 1,
 722         },
 723         [IORING_OP_READ_FIXED] = {
 724                 .needs_file             = 1,
 725                 .unbound_nonreg_file    = 1,
 726                 .pollin                 = 1,
 727         },
 728         [IORING_OP_WRITE_FIXED] = {
 729                 .needs_file             = 1,
 730                 .hash_reg_file          = 1,
 731                 .unbound_nonreg_file    = 1,
 732                 .pollout                = 1,
 733         },
 734         [IORING_OP_POLL_ADD] = {
 735                 .needs_file             = 1,
 736                 .unbound_nonreg_file    = 1,
 737         },
 738         [IORING_OP_POLL_REMOVE] = {},
 739         [IORING_OP_SYNC_FILE_RANGE] = {
 740                 .needs_file             = 1,
 741         },
 742         [IORING_OP_SENDMSG] = {
 743                 .async_ctx              = 1,
 744                 .needs_mm               = 1,
 745                 .needs_file             = 1,
 746                 .unbound_nonreg_file    = 1,
 747                 .needs_fs               = 1,
 748                 .pollout                = 1,
 749         },
 750         [IORING_OP_RECVMSG] = {
 751                 .async_ctx              = 1,
 752                 .needs_mm               = 1,
 753                 .needs_file             = 1,
 754                 .unbound_nonreg_file    = 1,
 755                 .needs_fs               = 1,
 756                 .pollin                 = 1,
 757                 .buffer_select          = 1,
 758         },
 759         [IORING_OP_TIMEOUT] = {
 760                 .async_ctx              = 1,
 761                 .needs_mm               = 1,
 762         },
 763         [IORING_OP_TIMEOUT_REMOVE] = {},
 764         [IORING_OP_ACCEPT] = {
 765                 .needs_mm               = 1,
 766                 .needs_file             = 1,
 767                 .unbound_nonreg_file    = 1,
 768                 .file_table             = 1,
 769                 .pollin                 = 1,
 770         },
 771         [IORING_OP_ASYNC_CANCEL] = {},
 772         [IORING_OP_LINK_TIMEOUT] = {
 773                 .async_ctx              = 1,
 774                 .needs_mm               = 1,
 775         },
 776         [IORING_OP_CONNECT] = {
 777                 .async_ctx              = 1,
 778                 .needs_mm               = 1,
 779                 .needs_file             = 1,
 780                 .unbound_nonreg_file    = 1,
 781                 .pollout                = 1,
 782         },
 783         [IORING_OP_FALLOCATE] = {
 784                 .needs_file             = 1,
 785         },
 786         [IORING_OP_OPENAT] = {
 787                 .file_table             = 1,
 788                 .needs_fs               = 1,
 789         },
 790         [IORING_OP_CLOSE] = {
 791                 .needs_file             = 1,
 792                 .file_table             = 1,
 793         },
 794         [IORING_OP_FILES_UPDATE] = {
 795                 .needs_mm               = 1,
 796                 .file_table             = 1,
 797         },
 798         [IORING_OP_STATX] = {
 799                 .needs_mm               = 1,
 800                 .needs_fs               = 1,
 801                 .file_table             = 1,
 802         },
 803         [IORING_OP_READ] = {
 804                 .needs_mm               = 1,
 805                 .needs_file             = 1,
 806                 .unbound_nonreg_file    = 1,
 807                 .pollin                 = 1,
 808                 .buffer_select          = 1,
 809         },
 810         [IORING_OP_WRITE] = {
 811                 .needs_mm               = 1,
 812                 .needs_file             = 1,
 813                 .unbound_nonreg_file    = 1,
 814                 .pollout                = 1,
 815         },
 816         [IORING_OP_FADVISE] = {
 817                 .needs_file             = 1,
 818         },
 819         [IORING_OP_MADVISE] = {
 820                 .needs_mm               = 1,
 821         },
 822         [IORING_OP_SEND] = {
 823                 .needs_mm               = 1,
 824                 .needs_file             = 1,
 825                 .unbound_nonreg_file    = 1,
 826                 .pollout                = 1,
 827         },
 828         [IORING_OP_RECV] = {
 829                 .needs_mm               = 1,
 830                 .needs_file             = 1,
 831                 .unbound_nonreg_file    = 1,
 832                 .pollin                 = 1,
 833                 .buffer_select          = 1,
 834         },
 835         [IORING_OP_OPENAT2] = {
 836                 .file_table             = 1,
 837                 .needs_fs               = 1,
 838         },
 839         [IORING_OP_EPOLL_CTL] = {
 840                 .unbound_nonreg_file    = 1,
 841                 .file_table             = 1,
 842         },
 843         [IORING_OP_SPLICE] = {
 844                 .needs_file             = 1,
 845                 .hash_reg_file          = 1,
 846                 .unbound_nonreg_file    = 1,
 847         },
 848         [IORING_OP_PROVIDE_BUFFERS] = {},
 849         [IORING_OP_REMOVE_BUFFERS] = {},
 850 };
 851
 852 static void io_wq_submit_work(struct io_wq_work **workptr);
 853 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 854 static void io_put_req(struct io_kiocb *req);
 855 static void __io_double_put_req(struct io_kiocb *req);
 856 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 857 static void io_queue_linked_timeout(struct io_kiocb *req);
 858 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 859                                  struct io_uring_files_update *ip,
 860                                  unsigned nr_args);
 861 static int io_grab_files(struct io_kiocb *req);
 862 static void io_cleanup_req(struct io_kiocb *req);
 863 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 864                        int fd, struct file **out_file, bool fixed);
 865 static void __io_queue_sqe(struct io_kiocb *req,
 866                            const struct io_uring_sqe *sqe);
 867
 868 static struct kmem_cache *req_cachep;
 869
 870 static const struct file_operations io_uring_fops;
 871
 872 struct sock *io_uring_get_socket(struct file *file)
 873 {
 874 #if defined(CONFIG_UNIX)
 875         if (file->f_op == &io_uring_fops) {
 876                 struct io_ring_ctx *ctx = file->private_data;
 877
 878                 return ctx->ring_sock->sk;
 879         }
 880 #endif
 881         return NULL;
 882 }
 883 EXPORT_SYMBOL(io_uring_get_socket);
 884
 885 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 886 {
 887         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 888
 889         complete(&ctx->completions[0]);
 890 }
 891
 892 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 893 {
 894         struct io_ring_ctx *ctx;
 895         int hash_bits;
 896
 897         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 898         if (!ctx)
 899                 return NULL;
 900
 901         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 902         if (!ctx->fallback_req)
 903                 goto err;
 904
 905         ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 906         if (!ctx->completions)
 907                 goto err;
 908
 909         /*
 910          * Use 5 bits less than the max cq entries, that should give us around
 911          * 32 entries per hash list if totally full and uniformly spread.
 912          */
 913         hash_bits = ilog2(p->cq_entries);
 914         hash_bits -= 5;
 915         if (hash_bits <= 0)
 916                 hash_bits = 1;
 917         ctx->cancel_hash_bits = hash_bits;
 918         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 919                                         GFP_KERNEL);
 920         if (!ctx->cancel_hash)
 921                 goto err;
 922         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 923
 924         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 925                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 926                 goto err;
 927
 928         ctx->flags = p->flags;
 929         init_waitqueue_head(&ctx->sqo_wait);
 930         init_waitqueue_head(&ctx->cq_wait);
 931         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 932         init_completion(&ctx->completions[0]);
 933         init_completion(&ctx->completions[1]);
 934         idr_init(&ctx->io_buffer_idr);
 935         idr_init(&ctx->personality_idr);
 936         mutex_init(&ctx->uring_lock);
 937         init_waitqueue_head(&ctx->wait);
 938         spin_lock_init(&ctx->completion_lock);
 939         INIT_LIST_HEAD(&ctx->poll_list);
 940         INIT_LIST_HEAD(&ctx->defer_list);
 941         INIT_LIST_HEAD(&ctx->timeout_list);
 942         init_waitqueue_head(&ctx->inflight_wait);
 943         spin_lock_init(&ctx->inflight_lock);
 944         INIT_LIST_HEAD(&ctx->inflight_list);
 945         return ctx;
 946 err:
 947         if (ctx->fallback_req)
 948                 kmem_cache_free(req_cachep, ctx->fallback_req);
 949         kfree(ctx->completions);
 950         kfree(ctx->cancel_hash);
 951         kfree(ctx);
 952         return NULL;
 953 }
 954
 955 static inline bool __req_need_defer(struct io_kiocb *req)
 956 {
 957         struct io_ring_ctx *ctx = req->ctx;
 958
 959         return req->sequence != ctx->cached_cq_tail
 960                                 + atomic_read(&ctx->cached_cq_overflow);
 961 }
 962
 963 static inline bool req_need_defer(struct io_kiocb *req)
 964 {
 965         if (unlikely(req->flags & REQ_F_IO_DRAIN))
 966                 return __req_need_defer(req);
 967
 968         return false;
 969 }
 970
 971 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 972 {
 973         struct io_kiocb *req;
 974
 975         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 976         if (req && !req_need_defer(req)) {
 977                 list_del_init(&req->list);
 978                 return req;
 979         }
 980
 981         return NULL;
 982 }
 983
 984 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 985 {
 986         struct io_kiocb *req;
 987
 988         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 989         if (req) {
 990                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 991                         return NULL;
 992                 if (!__req_need_defer(req)) {
 993                         list_del_init(&req->list);
 994                         return req;
 995                 }
 996         }
 997
 998         return NULL;
 999 }
1000
1001 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1002 {
1003         struct io_rings *rings = ctx->rings;
1004
1005         /* order cqe stores with ring update */
1006         smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1007
1008         if (wq_has_sleeper(&ctx->cq_wait)) {
1009                 wake_up_interruptible(&ctx->cq_wait);
1010                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1011         }
1012 }
1013
1014 static inline void io_req_work_grab_env(struct io_kiocb *req,
1015                                         const struct io_op_def *def)
1016 {
1017         if (!req->work.mm && def->needs_mm) {
1018                 mmgrab(current->mm);
1019                 req->work.mm = current->mm;
1020         }
1021         if (!req->work.creds)
1022                 req->work.creds = get_current_cred();
1023         if (!req->work.fs && def->needs_fs) {
1024                 spin_lock(&current->fs->lock);
1025                 if (!current->fs->in_exec) {
1026                         req->work.fs = current->fs;
1027                         req->work.fs->users++;
1028                 } else {
1029                         req->work.flags |= IO_WQ_WORK_CANCEL;
1030                 }
1031                 spin_unlock(&current->fs->lock);
1032         }
1033         if (!req->work.task_pid)
1034                 req->work.task_pid = task_pid_vnr(current);
1035 }
1036
1037 static inline void io_req_work_drop_env(struct io_kiocb *req)
1038 {
1039         if (req->work.mm) {
1040                 mmdrop(req->work.mm);
1041                 req->work.mm = NULL;
1042         }
1043         if (req->work.creds) {
1044                 put_cred(req->work.creds);
1045                 req->work.creds = NULL;
1046         }
1047         if (req->work.fs) {
1048                 struct fs_struct *fs = req->work.fs;
1049
1050                 spin_lock(&req->work.fs->lock);
1051                 if (--fs->users)
1052                         fs = NULL;
1053                 spin_unlock(&req->work.fs->lock);
1054                 if (fs)
1055                         free_fs_struct(fs);
1056         }
1057 }
1058
1059 static inline void io_prep_async_work(struct io_kiocb *req,
1060                                       struct io_kiocb **link)
1061 {
1062         const struct io_op_def *def = &io_op_defs[req->opcode];
1063
1064         if (req->flags & REQ_F_ISREG) {
1065                 if (def->hash_reg_file)
1066                         io_wq_hash_work(&req->work, file_inode(req->file));
1067         } else {
1068                 if (def->unbound_nonreg_file)
1069                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1070         }
1071
1072         io_req_work_grab_env(req, def);
1073
1074         *link = io_prep_linked_timeout(req);
1075 }
1076
1077 static inline void io_queue_async_work(struct io_kiocb *req)
1078 {
1079         struct io_ring_ctx *ctx = req->ctx;
1080         struct io_kiocb *link;
1081
1082         io_prep_async_work(req, &link);
1083
1084         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1085                                         &req->work, req->flags);
1086         io_wq_enqueue(ctx->io_wq, &req->work);
1087
1088         if (link)
1089                 io_queue_linked_timeout(link);
1090 }
1091
1092 static void io_kill_timeout(struct io_kiocb *req)
1093 {
1094         int ret;
1095
1096         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1097         if (ret != -1) {
1098                 atomic_inc(&req->ctx->cq_timeouts);
1099                 list_del_init(&req->list);
1100                 req->flags |= REQ_F_COMP_LOCKED;
1101                 io_cqring_fill_event(req, 0);
1102                 io_put_req(req);
1103         }
1104 }
1105
1106 static void io_kill_timeouts(struct io_ring_ctx *ctx)
1107 {
1108         struct io_kiocb *req, *tmp;
1109
1110         spin_lock_irq(&ctx->completion_lock);
1111         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1112                 io_kill_timeout(req);
1113         spin_unlock_irq(&ctx->completion_lock);
1114 }
1115
1116 static void io_commit_cqring(struct io_ring_ctx *ctx)
1117 {
1118         struct io_kiocb *req;
1119
1120         while ((req = io_get_timeout_req(ctx)) != NULL)
1121                 io_kill_timeout(req);
1122
1123         __io_commit_cqring(ctx);
1124
1125         while ((req = io_get_deferred_req(ctx)) != NULL)
1126                 io_queue_async_work(req);
1127 }
1128
1129 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1130 {
1131         struct io_rings *rings = ctx->rings;
1132         unsigned tail;
1133
1134         tail = ctx->cached_cq_tail;
1135         /*
1136          * writes to the cq entry need to come after reading head; the
1137          * control dependency is enough as we're using WRITE_ONCE to
1138          * fill the cq entry
1139          */
1140         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1141                 return NULL;
1142
1143         ctx->cached_cq_tail++;
1144         return &rings->cqes[tail & ctx->cq_mask];
1145 }
1146
1147 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1148 {
1149         if (!ctx->cq_ev_fd)
1150                 return false;
1151         if (!ctx->eventfd_async)
1152                 return true;
1153         return io_wq_current_is_worker();
1154 }
1155
1156 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1157 {
1158         if (waitqueue_active(&ctx->wait))
1159                 wake_up(&ctx->wait);
1160         if (waitqueue_active(&ctx->sqo_wait))
1161                 wake_up(&ctx->sqo_wait);
1162         if (io_should_trigger_evfd(ctx))
1163                 eventfd_signal(ctx->cq_ev_fd, 1);
1164 }
1165
1166 /* Returns true if there are no backlogged entries after the flush */
1167 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1168 {
1169         struct io_rings *rings = ctx->rings;
1170         struct io_uring_cqe *cqe;
1171         struct io_kiocb *req;
1172         unsigned long flags;
1173         LIST_HEAD(list);
1174
1175         if (!force) {
1176                 if (list_empty_careful(&ctx->cq_overflow_list))
1177                         return true;
1178                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1179                     rings->cq_ring_entries))
1180                         return false;
1181         }
1182
1183         spin_lock_irqsave(&ctx->completion_lock, flags);
1184
1185         /* if force is set, the ring is going away. always drop after that */
1186         if (force)
1187                 ctx->cq_overflow_flushed = 1;
1188
1189         cqe = NULL;
1190         while (!list_empty(&ctx->cq_overflow_list)) {
1191                 cqe = io_get_cqring(ctx);
1192                 if (!cqe && !force)
1193                         break;
1194
1195                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1196                                                 list);
1197                 list_move(&req->list, &list);
1198                 req->flags &= ~REQ_F_OVERFLOW;
1199                 if (cqe) {
1200                         WRITE_ONCE(cqe->user_data, req->user_data);
1201                         WRITE_ONCE(cqe->res, req->result);
1202                         WRITE_ONCE(cqe->flags, req->cflags);
1203                 } else {
1204                         WRITE_ONCE(ctx->rings->cq_overflow,
1205                                 atomic_inc_return(&ctx->cached_cq_overflow));
1206                 }
1207         }
1208
1209         io_commit_cqring(ctx);
1210         if (cqe) {
1211                 clear_bit(0, &ctx->sq_check_overflow);
1212                 clear_bit(0, &ctx->cq_check_overflow);
1213         }
1214         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1215         io_cqring_ev_posted(ctx);
1216
1217         while (!list_empty(&list)) {
1218                 req = list_first_entry(&list, struct io_kiocb, list);
1219                 list_del(&req->list);
1220                 io_put_req(req);
1221         }
1222
1223         return cqe != NULL;
1224 }
1225
1226 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1227 {
1228         struct io_ring_ctx *ctx = req->ctx;
1229         struct io_uring_cqe *cqe;
1230
1231         trace_io_uring_complete(ctx, req->user_data, res);
1232
1233         /*
1234          * If we can't get a cq entry, userspace overflowed the
1235          * submission (by quite a lot). Increment the overflow count in
1236          * the ring.
1237          */
1238         cqe = io_get_cqring(ctx);
1239         if (likely(cqe)) {
1240                 WRITE_ONCE(cqe->user_data, req->user_data);
1241                 WRITE_ONCE(cqe->res, res);
1242                 WRITE_ONCE(cqe->flags, cflags);
1243         } else if (ctx->cq_overflow_flushed) {
1244                 WRITE_ONCE(ctx->rings->cq_overflow,
1245                                 atomic_inc_return(&ctx->cached_cq_overflow));
1246         } else {
1247                 if (list_empty(&ctx->cq_overflow_list)) {
1248                         set_bit(0, &ctx->sq_check_overflow);
1249                         set_bit(0, &ctx->cq_check_overflow);
1250                 }
1251                 req->flags |= REQ_F_OVERFLOW;
1252                 refcount_inc(&req->refs);
1253                 req->result = res;
1254                 req->cflags = cflags;
1255                 list_add_tail(&req->list, &ctx->cq_overflow_list);
1256         }
1257 }
1258
1259 static void io_cqring_fill_event(struct io_kiocb *req, long res)
1260 {
1261         __io_cqring_fill_event(req, res, 0);
1262 }
1263
1264 static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1265 {
1266         struct io_ring_ctx *ctx = req->ctx;
1267         unsigned long flags;
1268
1269         spin_lock_irqsave(&ctx->completion_lock, flags);
1270         __io_cqring_fill_event(req, res, cflags);
1271         io_commit_cqring(ctx);
1272         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1273
1274         io_cqring_ev_posted(ctx);
1275 }
1276
1277 static void io_cqring_add_event(struct io_kiocb *req, long res)
1278 {
1279         __io_cqring_add_event(req, res, 0);
1280 }
1281
1282 static inline bool io_is_fallback_req(struct io_kiocb *req)
1283 {
1284         return req == (struct io_kiocb *)
1285                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1286 }
1287
1288 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1289 {
1290         struct io_kiocb *req;
1291
1292         req = ctx->fallback_req;
1293         if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1294                 return req;
1295
1296         return NULL;
1297 }
1298
1299 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1300                                      struct io_submit_state *state)
1301 {
1302         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1303         struct io_kiocb *req;
1304
1305         if (!state) {
1306                 req = kmem_cache_alloc(req_cachep, gfp);
1307                 if (unlikely(!req))
1308                         goto fallback;
1309         } else if (!state->free_reqs) {
1310                 size_t sz;
1311                 int ret;
1312
1313                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1314                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1315
1316                 /*
1317                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1318                  * retry single alloc to be on the safe side.
1319                  */
1320                 if (unlikely(ret <= 0)) {
1321                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1322                         if (!state->reqs[0])
1323                                 goto fallback;
1324                         ret = 1;
1325                 }
1326                 state->free_reqs = ret - 1;
1327                 req = state->reqs[ret - 1];
1328         } else {
1329                 state->free_reqs--;
1330                 req = state->reqs[state->free_reqs];
1331         }
1332
1333         return req;
1334 fallback:
1335         return io_get_fallback_req(ctx);
1336 }
1337
1338 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1339                           bool fixed)
1340 {
1341         if (fixed)
1342                 percpu_ref_put(req->fixed_file_refs);
1343         else
1344                 fput(file);
1345 }
1346
1347 static void __io_req_aux_free(struct io_kiocb *req)
1348 {
1349         if (req->flags & REQ_F_NEED_CLEANUP)
1350                 io_cleanup_req(req);
1351
1352         kfree(req->io);
1353         if (req->file)
1354                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1355         if (req->task)
1356                 put_task_struct(req->task);
1357
1358         io_req_work_drop_env(req);
1359 }
1360
1361 static void __io_free_req(struct io_kiocb *req)
1362 {
1363         __io_req_aux_free(req);
1364
1365         if (req->flags & REQ_F_INFLIGHT) {
1366                 struct io_ring_ctx *ctx = req->ctx;
1367                 unsigned long flags;
1368
1369                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1370                 list_del(&req->inflight_entry);
1371                 if (waitqueue_active(&ctx->inflight_wait))
1372                         wake_up(&ctx->inflight_wait);
1373                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1374         }
1375
1376         percpu_ref_put(&req->ctx->refs);
1377         if (likely(!io_is_fallback_req(req)))
1378                 kmem_cache_free(req_cachep, req);
1379         else
1380                 clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
1381 }
1382
1383 struct req_batch {
1384         void *reqs[IO_IOPOLL_BATCH];
1385         int to_free;
1386         int need_iter;
1387 };
1388
1389 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1390 {
1391         if (!rb->to_free)
1392                 return;
1393         if (rb->need_iter) {
1394                 int i, inflight = 0;
1395                 unsigned long flags;
1396
1397                 for (i = 0; i < rb->to_free; i++) {
1398                         struct io_kiocb *req = rb->reqs[i];
1399
1400                         if (req->flags & REQ_F_INFLIGHT)
1401                                 inflight++;
1402                         __io_req_aux_free(req);
1403                 }
1404                 if (!inflight)
1405                         goto do_free;
1406
1407                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1408                 for (i = 0; i < rb->to_free; i++) {
1409                         struct io_kiocb *req = rb->reqs[i];
1410
1411                         if (req->flags & REQ_F_INFLIGHT) {
1412                                 list_del(&req->inflight_entry);
1413                                 if (!--inflight)
1414                                         break;
1415                         }
1416                 }
1417                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1418
1419                 if (waitqueue_active(&ctx->inflight_wait))
1420                         wake_up(&ctx->inflight_wait);
1421         }
1422 do_free:
1423         kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1424         percpu_ref_put_many(&ctx->refs, rb->to_free);
1425         rb->to_free = rb->need_iter = 0;
1426 }
1427
1428 static bool io_link_cancel_timeout(struct io_kiocb *req)
1429 {
1430         struct io_ring_ctx *ctx = req->ctx;
1431         int ret;
1432
1433         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1434         if (ret != -1) {
1435                 io_cqring_fill_event(req, -ECANCELED);
1436                 io_commit_cqring(ctx);
1437                 req->flags &= ~REQ_F_LINK_HEAD;
1438                 io_put_req(req);
1439                 return true;
1440         }
1441
1442         return false;
1443 }
1444
1445 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1446 {
1447         struct io_ring_ctx *ctx = req->ctx;
1448         bool wake_ev = false;
1449
1450         /* Already got next link */
1451         if (req->flags & REQ_F_LINK_NEXT)
1452                 return;
1453
1454         /*
1455          * The list should never be empty when we are called here. But could
1456          * potentially happen if the chain is messed up, check to be on the
1457          * safe side.
1458          */
1459         while (!list_empty(&req->link_list)) {
1460                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1461                                                 struct io_kiocb, link_list);
1462
1463                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1464                              (nxt->flags & REQ_F_TIMEOUT))) {
1465                         list_del_init(&nxt->link_list);
1466                         wake_ev |= io_link_cancel_timeout(nxt);
1467                         req->flags &= ~REQ_F_LINK_TIMEOUT;
1468                         continue;
1469                 }
1470
1471                 list_del_init(&req->link_list);
1472                 if (!list_empty(&nxt->link_list))
1473                         nxt->flags |= REQ_F_LINK_HEAD;
1474                 *nxtptr = nxt;
1475                 break;
1476         }
1477
1478         req->flags |= REQ_F_LINK_NEXT;
1479         if (wake_ev)
1480                 io_cqring_ev_posted(ctx);
1481 }
1482
1483 /*
1484  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1485  */
1486 static void io_fail_links(struct io_kiocb *req)
1487 {
1488         struct io_ring_ctx *ctx = req->ctx;
1489         unsigned long flags;
1490
1491         spin_lock_irqsave(&ctx->completion_lock, flags);
1492
1493         while (!list_empty(&req->link_list)) {
1494                 struct io_kiocb *link = list_first_entry(&req->link_list,
1495                                                 struct io_kiocb, link_list);
1496
1497                 list_del_init(&link->link_list);
1498                 trace_io_uring_fail_link(req, link);
1499
1500                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1501                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1502                         io_link_cancel_timeout(link);
1503                 } else {
1504                         io_cqring_fill_event(link, -ECANCELED);
1505                         __io_double_put_req(link);
1506                 }
1507                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1508         }
1509
1510         io_commit_cqring(ctx);
1511         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1512         io_cqring_ev_posted(ctx);
1513 }
1514
1515 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1516 {
1517         if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1518                 return;
1519
1520         /*
1521          * If LINK is set, we have dependent requests in this chain. If we
1522          * didn't fail this request, queue the first one up, moving any other
1523          * dependencies to the next request. In case of failure, fail the rest
1524          * of the chain.
1525          */
1526         if (req->flags & REQ_F_FAIL_LINK) {
1527                 io_fail_links(req);
1528         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1529                         REQ_F_LINK_TIMEOUT) {
1530                 struct io_ring_ctx *ctx = req->ctx;
1531                 unsigned long flags;
1532
1533                 /*
1534                  * If this is a timeout link, we could be racing with the
1535                  * timeout timer. Grab the completion lock for this case to
1536                  * protect against that.
1537                  */
1538                 spin_lock_irqsave(&ctx->completion_lock, flags);
1539                 io_req_link_next(req, nxt);
1540                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1541         } else {
1542                 io_req_link_next(req, nxt);
1543         }
1544 }
1545
1546 static void io_free_req(struct io_kiocb *req)
1547 {
1548         struct io_kiocb *nxt = NULL;
1549
1550         io_req_find_next(req, &nxt);
1551         __io_free_req(req);
1552
1553         if (nxt)
1554                 io_queue_async_work(nxt);
1555 }
1556
1557 static void io_link_work_cb(struct io_wq_work **workptr)
1558 {
1559         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
1560         struct io_kiocb *link;
1561
1562         link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1563         io_queue_linked_timeout(link);
1564         io_wq_submit_work(workptr);
1565 }
1566
1567 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1568 {
1569         struct io_kiocb *link;
1570         const struct io_op_def *def = &io_op_defs[nxt->opcode];
1571
1572         if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
1573                 io_wq_hash_work(&nxt->work, file_inode(nxt->file));
1574
1575         *workptr = &nxt->work;
1576         link = io_prep_linked_timeout(nxt);
1577         if (link)
1578                 nxt->work.func = io_link_work_cb;
1579 }
1580
1581 /*
1582  * Drop reference to request, return next in chain (if there is one) if this
1583  * was the last reference to this request.
1584  */
1585 __attribute__((nonnull))
1586 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1587 {
1588         if (refcount_dec_and_test(&req->refs)) {
1589                 io_req_find_next(req, nxtptr);
1590                 __io_free_req(req);
1591         }
1592 }
1593
1594 static void io_put_req(struct io_kiocb *req)
1595 {
1596         if (refcount_dec_and_test(&req->refs))
1597                 io_free_req(req);
1598 }
1599
1600 static void io_steal_work(struct io_kiocb *req,
1601                           struct io_wq_work **workptr)
1602 {
1603         /*
1604          * It's in an io-wq worker, so there always should be at least
1605          * one reference, which will be dropped in io_put_work() just
1606          * after the current handler returns.
1607          *
1608          * It also means, that if the counter dropped to 1, then there is
1609          * no asynchronous users left, so it's safe to steal the next work.
1610          */
1611         if (refcount_read(&req->refs) == 1) {
1612                 struct io_kiocb *nxt = NULL;
1613
1614                 io_req_find_next(req, &nxt);
1615                 if (nxt)
1616                         io_wq_assign_next(workptr, nxt);
1617         }
1618 }
1619
1620 /*
1621  * Must only be used if we don't need to care about links, usually from
1622  * within the completion handling itself.
1623  */
1624 static void __io_double_put_req(struct io_kiocb *req)
1625 {
1626         /* drop both submit and complete references */
1627         if (refcount_sub_and_test(2, &req->refs))
1628                 __io_free_req(req);
1629 }
1630
1631 static void io_double_put_req(struct io_kiocb *req)
1632 {
1633         /* drop both submit and complete references */
1634         if (refcount_sub_and_test(2, &req->refs))
1635                 io_free_req(req);
1636 }
1637
1638 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1639 {
1640         struct io_rings *rings = ctx->rings;
1641
1642         if (test_bit(0, &ctx->cq_check_overflow)) {
1643                 /*
1644                  * noflush == true is from the waitqueue handler, just ensure
1645                  * we wake up the task, and the next invocation will flush the
1646                  * entries. We cannot safely to it from here.
1647                  */
1648                 if (noflush && !list_empty(&ctx->cq_overflow_list))
1649                         return -1U;
1650
1651                 io_cqring_overflow_flush(ctx, false);
1652         }
1653
1654         /* See comment at the top of this file */
1655         smp_rmb();
1656         return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
1657 }
1658
1659 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1660 {
1661         struct io_rings *rings = ctx->rings;
1662
1663         /* make sure SQ entry isn't read before tail */
1664         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1665 }
1666
1667 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
1668 {
1669         if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
1670                 return false;
1671
1672         if (req->file || req->io)
1673                 rb->need_iter++;
1674
1675         rb->reqs[rb->to_free++] = req;
1676         if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1677                 io_free_req_many(req->ctx, rb);
1678         return true;
1679 }
1680
1681 static int io_put_kbuf(struct io_kiocb *req)
1682 {
1683         struct io_buffer *kbuf;
1684         int cflags;
1685
1686         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
1687         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1688         cflags |= IORING_CQE_F_BUFFER;
1689         req->rw.addr = 0;
1690         kfree(kbuf);
1691         return cflags;
1692 }
1693
1694 /*
1695  * Find and free completed poll iocbs
1696  */
1697 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1698                                struct list_head *done)
1699 {
1700         struct req_batch rb;
1701         struct io_kiocb *req;
1702
1703         rb.to_free = rb.need_iter = 0;
1704         while (!list_empty(done)) {
1705                 int cflags = 0;
1706
1707                 req = list_first_entry(done, struct io_kiocb, list);
1708                 list_del(&req->list);
1709
1710                 if (req->flags & REQ_F_BUFFER_SELECTED)
1711                         cflags = io_put_kbuf(req);
1712
1713                 __io_cqring_fill_event(req, req->result, cflags);
1714                 (*nr_events)++;
1715
1716                 if (refcount_dec_and_test(&req->refs) &&
1717                     !io_req_multi_free(&rb, req))
1718                         io_free_req(req);
1719         }
1720
1721         io_commit_cqring(ctx);
1722         if (ctx->flags & IORING_SETUP_SQPOLL)
1723                 io_cqring_ev_posted(ctx);
1724         io_free_req_many(ctx, &rb);
1725 }
1726
1727 static void io_iopoll_queue(struct list_head *again)
1728 {
1729         struct io_kiocb *req;
1730
1731         do {
1732                 req = list_first_entry(again, struct io_kiocb, list);
1733                 list_del(&req->list);
1734                 refcount_inc(&req->refs);
1735                 io_queue_async_work(req);
1736         } while (!list_empty(again));
1737 }
1738
1739 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1740                         long min)
1741 {
1742         struct io_kiocb *req, *tmp;
1743         LIST_HEAD(done);
1744         LIST_HEAD(again);
1745         bool spin;
1746         int ret;
1747
1748         /*
1749          * Only spin for completions if we don't have multiple devices hanging
1750          * off our complete list, and we're under the requested amount.
1751          */
1752         spin = !ctx->poll_multi_file && *nr_events < min;
1753
1754         ret = 0;
1755         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1756                 struct kiocb *kiocb = &req->rw.kiocb;
1757
1758                 /*
1759                  * Move completed and retryable entries to our local lists.
1760                  * If we find a request that requires polling, break out
1761                  * and complete those lists first, if we have entries there.
1762                  */
1763                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1764                         list_move_tail(&req->list, &done);
1765                         continue;
1766                 }
1767                 if (!list_empty(&done))
1768                         break;
1769
1770                 if (req->result == -EAGAIN) {
1771                         list_move_tail(&req->list, &again);
1772                         continue;
1773                 }
1774                 if (!list_empty(&again))
1775                         break;
1776
1777                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1778                 if (ret < 0)
1779                         break;
1780
1781                 if (ret && spin)
1782                         spin = false;
1783                 ret = 0;
1784         }
1785
1786         if (!list_empty(&done))
1787                 io_iopoll_complete(ctx, nr_events, &done);
1788
1789         if (!list_empty(&again))
1790                 io_iopoll_queue(&again);
1791
1792         return ret;
1793 }
1794
1795 /*
1796  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1797  * non-spinning poll check - we'll still enter the driver poll loop, but only
1798  * as a non-spinning completion check.
1799  */
1800 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1801                                 long min)
1802 {
1803         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1804                 int ret;
1805
1806                 ret = io_do_iopoll(ctx, nr_events, min);
1807                 if (ret < 0)
1808                         return ret;
1809                 if (!min || *nr_events >= min)
1810                         return 0;
1811         }
1812
1813         return 1;
1814 }
1815
1816 /*
1817  * We can't just wait for polled events to come to us, we have to actively
1818  * find and complete them.
1819  */
1820 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1821 {
1822         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1823                 return;
1824
1825         mutex_lock(&ctx->uring_lock);
1826         while (!list_empty(&ctx->poll_list)) {
1827                 unsigned int nr_events = 0;
1828
1829                 io_iopoll_getevents(ctx, &nr_events, 1);
1830
1831                 /*
1832                  * Ensure we allow local-to-the-cpu processing to take place,
1833                  * in this case we need to ensure that we reap all events.
1834                  */
1835                 cond_resched();
1836         }
1837         mutex_unlock(&ctx->uring_lock);
1838 }
1839
1840 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1841                            long min)
1842 {
1843         int iters = 0, ret = 0;
1844
1845         /*
1846          * We disallow the app entering submit/complete with polling, but we
1847          * still need to lock the ring to prevent racing with polled issue
1848          * that got punted to a workqueue.
1849          */
1850         mutex_lock(&ctx->uring_lock);
1851         do {
1852                 int tmin = 0;
1853
1854                 /*
1855                  * Don't enter poll loop if we already have events pending.
1856                  * If we do, we can potentially be spinning for commands that
1857                  * already triggered a CQE (eg in error).
1858                  */
1859                 if (io_cqring_events(ctx, false))
1860                         break;
1861
1862                 /*
1863                  * If a submit got punted to a workqueue, we can have the
1864                  * application entering polling for a command before it gets
1865                  * issued. That app will hold the uring_lock for the duration
1866                  * of the poll right here, so we need to take a breather every
1867                  * now and then to ensure that the issue has a chance to add
1868                  * the poll to the issued list. Otherwise we can spin here
1869                  * forever, while the workqueue is stuck trying to acquire the
1870                  * very same mutex.
1871                  */
1872                 if (!(++iters & 7)) {
1873                         mutex_unlock(&ctx->uring_lock);
1874                         mutex_lock(&ctx->uring_lock);
1875                 }
1876
1877                 if (*nr_events < min)
1878                         tmin = min - *nr_events;
1879
1880                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1881                 if (ret <= 0)
1882                         break;
1883                 ret = 0;
1884         } while (min && !*nr_events && !need_resched());
1885
1886         mutex_unlock(&ctx->uring_lock);
1887         return ret;
1888 }
1889
1890 static void kiocb_end_write(struct io_kiocb *req)
1891 {
1892         /*
1893          * Tell lockdep we inherited freeze protection from submission
1894          * thread.
1895          */
1896         if (req->flags & REQ_F_ISREG) {
1897                 struct inode *inode = file_inode(req->file);
1898
1899                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1900         }
1901         file_end_write(req->file);
1902 }
1903
1904 static inline void req_set_fail_links(struct io_kiocb *req)
1905 {
1906         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1907                 req->flags |= REQ_F_FAIL_LINK;
1908 }
1909
1910 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1911 {
1912         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1913         int cflags = 0;
1914
1915         if (kiocb->ki_flags & IOCB_WRITE)
1916                 kiocb_end_write(req);
1917
1918         if (res != req->result)
1919                 req_set_fail_links(req);
1920         if (req->flags & REQ_F_BUFFER_SELECTED)
1921                 cflags = io_put_kbuf(req);
1922         __io_cqring_add_event(req, res, cflags);
1923 }
1924
1925 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1926 {
1927         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1928
1929         io_complete_rw_common(kiocb, res);
1930         io_put_req(req);
1931 }
1932
1933 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1934 {
1935         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1936
1937         if (kiocb->ki_flags & IOCB_WRITE)
1938                 kiocb_end_write(req);
1939
1940         if (res != req->result)
1941                 req_set_fail_links(req);
1942         req->result = res;
1943         if (res != -EAGAIN)
1944                 req->flags |= REQ_F_IOPOLL_COMPLETED;
1945 }
1946
1947 /*
1948  * After the iocb has been issued, it's safe to be found on the poll list.
1949  * Adding the kiocb to the list AFTER submission ensures that we don't
1950  * find it from a io_iopoll_getevents() thread before the issuer is done
1951  * accessing the kiocb cookie.
1952  */
1953 static void io_iopoll_req_issued(struct io_kiocb *req)
1954 {
1955         struct io_ring_ctx *ctx = req->ctx;
1956
1957         /*
1958          * Track whether we have multiple files in our lists. This will impact
1959          * how we do polling eventually, not spinning if we're on potentially
1960          * different devices.
1961          */
1962         if (list_empty(&ctx->poll_list)) {
1963                 ctx->poll_multi_file = false;
1964         } else if (!ctx->poll_multi_file) {
1965                 struct io_kiocb *list_req;
1966
1967                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1968                                                 list);
1969                 if (list_req->file != req->file)
1970                         ctx->poll_multi_file = true;
1971         }
1972
1973         /*
1974          * For fast devices, IO may have already completed. If it has, add
1975          * it to the front so we find it first.
1976          */
1977         if (req->flags & REQ_F_IOPOLL_COMPLETED)
1978                 list_add(&req->list, &ctx->poll_list);
1979         else
1980                 list_add_tail(&req->list, &ctx->poll_list);
1981
1982         if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1983             wq_has_sleeper(&ctx->sqo_wait))
1984                 wake_up(&ctx->sqo_wait);
1985 }
1986
1987 static void io_file_put(struct io_submit_state *state)
1988 {
1989         if (state->file) {
1990                 int diff = state->has_refs - state->used_refs;
1991
1992                 if (diff)
1993                         fput_many(state->file, diff);
1994                 state->file = NULL;
1995         }
1996 }
1997
1998 /*
1999  * Get as many references to a file as we have IOs left in this submission,
2000  * assuming most submissions are for one file, or at least that each file
2001  * has more than one submission.
2002  */
2003 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2004 {
2005         if (!state)
2006                 return fget(fd);
2007
2008         if (state->file) {
2009                 if (state->fd == fd) {
2010                         state->used_refs++;
2011                         state->ios_left--;
2012                         return state->file;
2013                 }
2014                 io_file_put(state);
2015         }
2016         state->file = fget_many(fd, state->ios_left);
2017         if (!state->file)
2018                 return NULL;
2019
2020         state->fd = fd;
2021         state->has_refs = state->ios_left;
2022         state->used_refs = 1;
2023         state->ios_left--;
2024         return state->file;
2025 }
2026
2027 /*
2028  * If we tracked the file through the SCM inflight mechanism, we could support
2029  * any file. For now, just ensure that anything potentially problematic is done
2030  * inline.
2031  */
2032 static bool io_file_supports_async(struct file *file, int rw)
2033 {
2034         umode_t mode = file_inode(file)->i_mode;
2035
2036         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2037                 return true;
2038         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
2039                 return true;
2040
2041         if (!(file->f_mode & FMODE_NOWAIT))
2042                 return false;
2043
2044         if (rw == READ)
2045                 return file->f_op->read_iter != NULL;
2046
2047         return file->f_op->write_iter != NULL;
2048 }
2049
2050 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2051                       bool force_nonblock)
2052 {
2053         struct io_ring_ctx *ctx = req->ctx;
2054         struct kiocb *kiocb = &req->rw.kiocb;
2055         unsigned ioprio;
2056         int ret;
2057
2058         if (S_ISREG(file_inode(req->file)->i_mode))
2059                 req->flags |= REQ_F_ISREG;
2060
2061         kiocb->ki_pos = READ_ONCE(sqe->off);
2062         if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2063                 req->flags |= REQ_F_CUR_POS;
2064                 kiocb->ki_pos = req->file->f_pos;
2065         }
2066         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2067         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2068         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2069         if (unlikely(ret))
2070                 return ret;
2071
2072         ioprio = READ_ONCE(sqe->ioprio);
2073         if (ioprio) {
2074                 ret = ioprio_check_cap(ioprio);
2075                 if (ret)
2076                         return ret;
2077
2078                 kiocb->ki_ioprio = ioprio;
2079         } else
2080                 kiocb->ki_ioprio = get_current_ioprio();
2081
2082         /* don't allow async punt if RWF_NOWAIT was requested */
2083         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2084             (req->file->f_flags & O_NONBLOCK))
2085                 req->flags |= REQ_F_NOWAIT;
2086
2087         if (force_nonblock)
2088                 kiocb->ki_flags |= IOCB_NOWAIT;
2089
2090         if (ctx->flags & IORING_SETUP_IOPOLL) {
2091                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2092                     !kiocb->ki_filp->f_op->iopoll)
2093                         return -EOPNOTSUPP;
2094
2095                 kiocb->ki_flags |= IOCB_HIPRI;
2096                 kiocb->ki_complete = io_complete_rw_iopoll;
2097                 req->result = 0;
2098         } else {
2099                 if (kiocb->ki_flags & IOCB_HIPRI)
2100                         return -EINVAL;
2101                 kiocb->ki_complete = io_complete_rw;
2102         }
2103
2104         req->rw.addr = READ_ONCE(sqe->addr);
2105         req->rw.len = READ_ONCE(sqe->len);
2106         req->buf_index = READ_ONCE(sqe->buf_index);
2107         return 0;
2108 }
2109
2110 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2111 {
2112         switch (ret) {
2113         case -EIOCBQUEUED:
2114                 break;
2115         case -ERESTARTSYS:
2116         case -ERESTARTNOINTR:
2117         case -ERESTARTNOHAND:
2118         case -ERESTART_RESTARTBLOCK:
2119                 /*
2120                  * We can't just restart the syscall, since previously
2121                  * submitted sqes may already be in progress. Just fail this
2122                  * IO with EINTR.
2123                  */
2124                 ret = -EINTR;
2125                 /* fall through */
2126         default:
2127                 kiocb->ki_complete(kiocb, ret, 0);
2128         }
2129 }
2130
2131 static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
2132 {
2133         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2134
2135         if (req->flags & REQ_F_CUR_POS)
2136                 req->file->f_pos = kiocb->ki_pos;
2137         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2138                 io_complete_rw(kiocb, ret, 0);
2139         else
2140                 io_rw_done(kiocb, ret);
2141 }
2142
2143 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2144                                struct iov_iter *iter)
2145 {
2146         struct io_ring_ctx *ctx = req->ctx;
2147         size_t len = req->rw.len;
2148         struct io_mapped_ubuf *imu;
2149         u16 index, buf_index;
2150         size_t offset;
2151         u64 buf_addr;
2152
2153         /* attempt to use fixed buffers without having provided iovecs */
2154         if (unlikely(!ctx->user_bufs))
2155                 return -EFAULT;
2156
2157         buf_index = req->buf_index;
2158         if (unlikely(buf_index >= ctx->nr_user_bufs))
2159                 return -EFAULT;
2160
2161         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2162         imu = &ctx->user_bufs[index];
2163         buf_addr = req->rw.addr;
2164
2165         /* overflow */
2166         if (buf_addr + len < buf_addr)
2167                 return -EFAULT;
2168         /* not inside the mapped region */
2169         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2170                 return -EFAULT;
2171
2172         /*
2173          * May not be a start of buffer, set size appropriately
2174          * and advance us to the beginning.
2175          */
2176         offset = buf_addr - imu->ubuf;
2177         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2178
2179         if (offset) {
2180                 /*
2181                  * Don't use iov_iter_advance() here, as it's really slow for
2182                  * using the latter parts of a big fixed buffer - it iterates
2183                  * over each segment manually. We can cheat a bit here, because
2184                  * we know that:
2185                  *
2186                  * 1) it's a BVEC iter, we set it up
2187                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2188                  *    first and last bvec
2189                  *
2190                  * So just find our index, and adjust the iterator afterwards.
2191                  * If the offset is within the first bvec (or the whole first
2192                  * bvec, just use iov_iter_advance(). This makes it easier
2193                  * since we can just skip the first segment, which may not
2194                  * be PAGE_SIZE aligned.
2195                  */
2196                 const struct bio_vec *bvec = imu->bvec;
2197
2198                 if (offset <= bvec->bv_len) {
2199                         iov_iter_advance(iter, offset);
2200                 } else {
2201                         unsigned long seg_skip;
2202
2203                         /* skip first vec */
2204                         offset -= bvec->bv_len;
2205                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2206
2207                         iter->bvec = bvec + seg_skip;
2208                         iter->nr_segs -= seg_skip;
2209                         iter->count -= bvec->bv_len + offset;
2210                         iter->iov_offset = offset & ~PAGE_MASK;
2211                 }
2212         }
2213
2214         return len;
2215 }
2216
2217 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2218 {
2219         if (needs_lock)
2220                 mutex_unlock(&ctx->uring_lock);
2221 }
2222
2223 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2224 {
2225         /*
2226          * "Normal" inline submissions always hold the uring_lock, since we
2227          * grab it from the system call. Same is true for the SQPOLL offload.
2228          * The only exception is when we've detached the request and issue it
2229          * from an async worker thread, grab the lock for that case.
2230          */
2231         if (needs_lock)
2232                 mutex_lock(&ctx->uring_lock);
2233 }
2234
2235 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2236                                           int bgid, struct io_buffer *kbuf,
2237                                           bool needs_lock)
2238 {
2239         struct io_buffer *head;
2240
2241         if (req->flags & REQ_F_BUFFER_SELECTED)
2242                 return kbuf;
2243
2244         io_ring_submit_lock(req->ctx, needs_lock);
2245
2246         lockdep_assert_held(&req->ctx->uring_lock);
2247
2248         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2249         if (head) {
2250                 if (!list_empty(&head->list)) {
2251                         kbuf = list_last_entry(&head->list, struct io_buffer,
2252                                                         list);
2253                         list_del(&kbuf->list);
2254                 } else {
2255                         kbuf = head;
2256                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2257                 }
2258                 if (*len > kbuf->len)
2259                         *len = kbuf->len;
2260         } else {
2261                 kbuf = ERR_PTR(-ENOBUFS);
2262         }
2263
2264         io_ring_submit_unlock(req->ctx, needs_lock);
2265
2266         return kbuf;
2267 }
2268
2269 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2270                                         bool needs_lock)
2271 {
2272         struct io_buffer *kbuf;
2273         u16 bgid;
2274
2275         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2276         bgid = req->buf_index;
2277         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2278         if (IS_ERR(kbuf))
2279                 return kbuf;
2280         req->rw.addr = (u64) (unsigned long) kbuf;
2281         req->flags |= REQ_F_BUFFER_SELECTED;
2282         return u64_to_user_ptr(kbuf->addr);
2283 }
2284
2285 #ifdef CONFIG_COMPAT
2286 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2287                                 bool needs_lock)
2288 {
2289         struct compat_iovec __user *uiov;
2290         compat_ssize_t clen;
2291         void __user *buf;
2292         ssize_t len;
2293
2294         uiov = u64_to_user_ptr(req->rw.addr);
2295         if (!access_ok(uiov, sizeof(*uiov)))
2296                 return -EFAULT;
2297         if (__get_user(clen, &uiov->iov_len))
2298                 return -EFAULT;
2299         if (clen < 0)
2300                 return -EINVAL;
2301
2302         len = clen;
2303         buf = io_rw_buffer_select(req, &len, needs_lock);
2304         if (IS_ERR(buf))
2305                 return PTR_ERR(buf);
2306         iov[0].iov_base = buf;
2307         iov[0].iov_len = (compat_size_t) len;
2308         return 0;
2309 }
2310 #endif
2311
2312 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2313                                       bool needs_lock)
2314 {
2315         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2316         void __user *buf;
2317         ssize_t len;
2318
2319         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2320                 return -EFAULT;
2321
2322         len = iov[0].iov_len;
2323         if (len < 0)
2324                 return -EINVAL;
2325         buf = io_rw_buffer_select(req, &len, needs_lock);
2326         if (IS_ERR(buf))
2327                 return PTR_ERR(buf);
2328         iov[0].iov_base = buf;
2329         iov[0].iov_len = len;
2330         return 0;
2331 }
2332
2333 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2334                                     bool needs_lock)
2335 {
2336         if (req->flags & REQ_F_BUFFER_SELECTED)
2337                 return 0;
2338         if (!req->rw.len)
2339                 return 0;
2340         else if (req->rw.len > 1)
2341                 return -EINVAL;
2342
2343 #ifdef CONFIG_COMPAT
2344         if (req->ctx->compat)
2345                 return io_compat_import(req, iov, needs_lock);
2346 #endif
2347
2348         return __io_iov_buffer_select(req, iov, needs_lock);
2349 }
2350
2351 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2352                                struct iovec **iovec, struct iov_iter *iter,
2353                                bool needs_lock)
2354 {
2355         void __user *buf = u64_to_user_ptr(req->rw.addr);
2356         size_t sqe_len = req->rw.len;
2357         ssize_t ret;
2358         u8 opcode;
2359
2360         opcode = req->opcode;
2361         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2362                 *iovec = NULL;
2363                 return io_import_fixed(req, rw, iter);
2364         }
2365
2366         /* buffer index only valid with fixed read/write, or buffer select  */
2367         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2368                 return -EINVAL;
2369
2370         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2371                 if (req->flags & REQ_F_BUFFER_SELECT) {
2372                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2373                         if (IS_ERR(buf)) {
2374                                 *iovec = NULL;
2375                                 return PTR_ERR(buf);
2376                         }
2377                         req->rw.len = sqe_len;
2378                 }
2379
2380                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2381                 *iovec = NULL;
2382                 return ret < 0 ? ret : sqe_len;
2383         }
2384
2385         if (req->io) {
2386                 struct io_async_rw *iorw = &req->io->rw;
2387
2388                 *iovec = iorw->iov;
2389                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2390                 if (iorw->iov == iorw->fast_iov)
2391                         *iovec = NULL;
2392                 return iorw->size;
2393         }
2394
2395         if (req->flags & REQ_F_BUFFER_SELECT) {
2396                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2397                 if (!ret) {
2398                         ret = (*iovec)->iov_len;
2399                         iov_iter_init(iter, rw, *iovec, 1, ret);
2400                 }
2401                 *iovec = NULL;
2402                 return ret;
2403         }
2404
2405 #ifdef CONFIG_COMPAT
2406         if (req->ctx->compat)
2407                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2408                                                 iovec, iter);
2409 #endif
2410
2411         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2412 }
2413
2414 /*
2415  * For files that don't have ->read_iter() and ->write_iter(), handle them
2416  * by looping over ->read() or ->write() manually.
2417  */
2418 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2419                            struct iov_iter *iter)
2420 {
2421         ssize_t ret = 0;
2422
2423         /*
2424          * Don't support polled IO through this interface, and we can't
2425          * support non-blocking either. For the latter, this just causes
2426          * the kiocb to be handled from an async context.
2427          */
2428         if (kiocb->ki_flags & IOCB_HIPRI)
2429                 return -EOPNOTSUPP;
2430         if (kiocb->ki_flags & IOCB_NOWAIT)
2431                 return -EAGAIN;
2432
2433         while (iov_iter_count(iter)) {
2434                 struct iovec iovec;
2435                 ssize_t nr;
2436
2437                 if (!iov_iter_is_bvec(iter)) {
2438                         iovec = iov_iter_iovec(iter);
2439                 } else {
2440                         /* fixed buffers import bvec */
2441                         iovec.iov_base = kmap(iter->bvec->bv_page)
2442                                                 + iter->iov_offset;
2443                         iovec.iov_len = min(iter->count,
2444                                         iter->bvec->bv_len - iter->iov_offset);
2445                 }
2446
2447                 if (rw == READ) {
2448                         nr = file->f_op->read(file, iovec.iov_base,
2449                                               iovec.iov_len, &kiocb->ki_pos);
2450                 } else {
2451                         nr = file->f_op->write(file, iovec.iov_base,
2452                                                iovec.iov_len, &kiocb->ki_pos);
2453                 }
2454
2455                 if (iov_iter_is_bvec(iter))
2456                         kunmap(iter->bvec->bv_page);
2457
2458                 if (nr < 0) {
2459                         if (!ret)
2460                                 ret = nr;
2461                         break;
2462                 }
2463                 ret += nr;
2464                 if (nr != iovec.iov_len)
2465                         break;
2466                 iov_iter_advance(iter, nr);
2467         }
2468
2469         return ret;
2470 }
2471
2472 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2473                           struct iovec *iovec, struct iovec *fast_iov,
2474                           struct iov_iter *iter)
2475 {
2476         req->io->rw.nr_segs = iter->nr_segs;
2477         req->io->rw.size = io_size;
2478         req->io->rw.iov = iovec;
2479         if (!req->io->rw.iov) {
2480                 req->io->rw.iov = req->io->rw.fast_iov;
2481                 if (req->io->rw.iov != fast_iov)
2482                         memcpy(req->io->rw.iov, fast_iov,
2483                                sizeof(struct iovec) * iter->nr_segs);
2484         } else {
2485                 req->flags |= REQ_F_NEED_CLEANUP;
2486         }
2487 }
2488
2489 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
2490 {
2491         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2492         return req->io == NULL;
2493 }
2494
2495 static int io_alloc_async_ctx(struct io_kiocb *req)
2496 {
2497         if (!io_op_defs[req->opcode].async_ctx)
2498                 return 0;
2499
2500         return  __io_alloc_async_ctx(req);
2501 }
2502
2503 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2504                              struct iovec *iovec, struct iovec *fast_iov,
2505                              struct iov_iter *iter)
2506 {
2507         if (!io_op_defs[req->opcode].async_ctx)
2508                 return 0;
2509         if (!req->io) {
2510                 if (__io_alloc_async_ctx(req))
2511                         return -ENOMEM;
2512
2513                 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2514         }
2515         return 0;
2516 }
2517
2518 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2519                         bool force_nonblock)
2520 {
2521         struct io_async_ctx *io;
2522         struct iov_iter iter;
2523         ssize_t ret;
2524
2525         ret = io_prep_rw(req, sqe, force_nonblock);
2526         if (ret)
2527                 return ret;
2528
2529         if (unlikely(!(req->file->f_mode & FMODE_READ)))
2530                 return -EBADF;
2531
2532         /* either don't need iovec imported or already have it */
2533         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2534                 return 0;
2535
2536         io = req->io;
2537         io->rw.iov = io->rw.fast_iov;
2538         req->io = NULL;
2539         ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
2540         req->io = io;
2541         if (ret < 0)
2542                 return ret;
2543
2544         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2545         return 0;
2546 }
2547
2548 static int io_read(struct io_kiocb *req, bool force_nonblock)
2549 {
2550         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2551         struct kiocb *kiocb = &req->rw.kiocb;
2552         struct iov_iter iter;
2553         size_t iov_count;
2554         ssize_t io_size, ret;
2555
2556         ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
2557         if (ret < 0)
2558                 return ret;
2559
2560         /* Ensure we clear previously set non-block flag */
2561         if (!force_nonblock)
2562                 kiocb->ki_flags &= ~IOCB_NOWAIT;
2563
2564         req->result = 0;
2565         io_size = ret;
2566         if (req->flags & REQ_F_LINK_HEAD)
2567                 req->result = io_size;
2568
2569         /*
2570          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2571          * we know to async punt it even if it was opened O_NONBLOCK
2572          */
2573         if (force_nonblock && !io_file_supports_async(req->file, READ))
2574                 goto copy_iov;
2575
2576         iov_count = iov_iter_count(&iter);
2577         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2578         if (!ret) {
2579                 ssize_t ret2;
2580
2581                 if (req->file->f_op->read_iter)
2582                         ret2 = call_read_iter(req->file, kiocb, &iter);
2583                 else
2584                         ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
2585
2586                 /* Catch -EAGAIN return for forced non-blocking submission */
2587                 if (!force_nonblock || ret2 != -EAGAIN) {
2588                         kiocb_done(kiocb, ret2);
2589                 } else {
2590 copy_iov:
2591                         ret = io_setup_async_rw(req, io_size, iovec,
2592                                                 inline_vecs, &iter);
2593                         if (ret)
2594                                 goto out_free;
2595                         /* any defer here is final, must blocking retry */
2596                         if (!(req->flags & REQ_F_NOWAIT) &&
2597                             !file_can_poll(req->file))
2598                                 req->flags |= REQ_F_MUST_PUNT;
2599                         return -EAGAIN;
2600                 }
2601         }
2602 out_free:
2603         kfree(iovec);
2604         req->flags &= ~REQ_F_NEED_CLEANUP;
2605         return ret;
2606 }
2607
2608 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2609                          bool force_nonblock)
2610 {
2611         struct io_async_ctx *io;
2612         struct iov_iter iter;
2613         ssize_t ret;
2614
2615         ret = io_prep_rw(req, sqe, force_nonblock);
2616         if (ret)
2617                 return ret;
2618
2619         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2620                 return -EBADF;
2621
2622         req->fsize = rlimit(RLIMIT_FSIZE);
2623
2624         /* either don't need iovec imported or already have it */
2625         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2626                 return 0;
2627
2628         io = req->io;
2629         io->rw.iov = io->rw.fast_iov;
2630         req->io = NULL;
2631         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
2632         req->io = io;
2633         if (ret < 0)
2634                 return ret;
2635
2636         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2637         return 0;
2638 }
2639
2640 static int io_write(struct io_kiocb *req, bool force_nonblock)
2641 {
2642         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2643         struct kiocb *kiocb = &req->rw.kiocb;
2644         struct iov_iter iter;
2645         size_t iov_count;
2646         ssize_t ret, io_size;
2647
2648         ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
2649         if (ret < 0)
2650                 return ret;
2651
2652         /* Ensure we clear previously set non-block flag */
2653         if (!force_nonblock)
2654                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
2655
2656         req->result = 0;
2657         io_size = ret;
2658         if (req->flags & REQ_F_LINK_HEAD)
2659                 req->result = io_size;
2660
2661         /*
2662          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
2663          * we know to async punt it even if it was opened O_NONBLOCK
2664          */
2665         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
2666                 goto copy_iov;
2667
2668         /* file path doesn't support NOWAIT for non-direct_IO */
2669         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
2670             (req->flags & REQ_F_ISREG))
2671                 goto copy_iov;
2672
2673         iov_count = iov_iter_count(&iter);
2674         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
2675         if (!ret) {
2676                 ssize_t ret2;
2677
2678                 /*
2679                  * Open-code file_start_write here to grab freeze protection,
2680                  * which will be released by another thread in
2681                  * io_complete_rw().  Fool lockdep by telling it the lock got
2682                  * released so that it doesn't complain about the held lock when
2683                  * we return to userspace.
2684                  */
2685                 if (req->flags & REQ_F_ISREG) {
2686                         __sb_start_write(file_inode(req->file)->i_sb,
2687                                                 SB_FREEZE_WRITE, true);
2688                         __sb_writers_release(file_inode(req->file)->i_sb,
2689                                                 SB_FREEZE_WRITE);
2690                 }
2691                 kiocb->ki_flags |= IOCB_WRITE;
2692
2693                 if (!force_nonblock)
2694                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2695
2696                 if (req->file->f_op->write_iter)
2697                         ret2 = call_write_iter(req->file, kiocb, &iter);
2698                 else
2699                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
2700
2701                 if (!force_nonblock)
2702                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2703
2704                 /*
2705                  * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
2706                  * retry them without IOCB_NOWAIT.
2707                  */
2708                 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
2709                         ret2 = -EAGAIN;
2710                 if (!force_nonblock || ret2 != -EAGAIN) {
2711                         kiocb_done(kiocb, ret2);
2712                 } else {
2713 copy_iov:
2714                         ret = io_setup_async_rw(req, io_size, iovec,
2715                                                 inline_vecs, &iter);
2716                         if (ret)
2717                                 goto out_free;
2718                         /* any defer here is final, must blocking retry */
2719                         if (!file_can_poll(req->file))
2720                                 req->flags |= REQ_F_MUST_PUNT;
2721                         return -EAGAIN;
2722                 }
2723         }
2724 out_free:
2725         req->flags &= ~REQ_F_NEED_CLEANUP;
2726         kfree(iovec);
2727         return ret;
2728 }
2729
2730 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2731 {
2732         struct io_splice* sp = &req->splice;
2733         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2734         int ret;
2735
2736         if (req->flags & REQ_F_NEED_CLEANUP)
2737                 return 0;
2738
2739         sp->file_in = NULL;
2740         sp->off_in = READ_ONCE(sqe->splice_off_in);
2741         sp->off_out = READ_ONCE(sqe->off);
2742         sp->len = READ_ONCE(sqe->len);
2743         sp->flags = READ_ONCE(sqe->splice_flags);
2744
2745         if (unlikely(sp->flags & ~valid_flags))
2746                 return -EINVAL;
2747
2748         ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2749                           (sp->flags & SPLICE_F_FD_IN_FIXED));
2750         if (ret)
2751                 return ret;
2752         req->flags |= REQ_F_NEED_CLEANUP;
2753
2754         if (!S_ISREG(file_inode(sp->file_in)->i_mode))
2755                 req->work.flags |= IO_WQ_WORK_UNBOUND;
2756
2757         return 0;
2758 }
2759
2760 static int io_splice(struct io_kiocb *req, bool force_nonblock)
2761 {
2762         struct io_splice *sp = &req->splice;
2763         struct file *in = sp->file_in;
2764         struct file *out = sp->file_out;
2765         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2766         loff_t *poff_in, *poff_out;
2767         long ret = 0;
2768
2769         if (force_nonblock)
2770                 return -EAGAIN;
2771
2772         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2773         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2774
2775         if (sp->len)
2776                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2777
2778         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2779         req->flags &= ~REQ_F_NEED_CLEANUP;
2780
2781         io_cqring_add_event(req, ret);
2782         if (ret != sp->len)
2783                 req_set_fail_links(req);
2784         io_put_req(req);
2785         return 0;
2786 }
2787
2788 /*
2789  * IORING_OP_NOP just posts a completion event, nothing else.
2790  */
2791 static int io_nop(struct io_kiocb *req)
2792 {
2793         struct io_ring_ctx *ctx = req->ctx;
2794
2795         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2796                 return -EINVAL;
2797
2798         io_cqring_add_event(req, 0);
2799         io_put_req(req);
2800         return 0;
2801 }
2802
2803 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2804 {
2805         struct io_ring_ctx *ctx = req->ctx;
2806
2807         if (!req->file)
2808                 return -EBADF;
2809
2810         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2811                 return -EINVAL;
2812         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2813                 return -EINVAL;
2814
2815         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2816         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2817                 return -EINVAL;
2818
2819         req->sync.off = READ_ONCE(sqe->off);
2820         req->sync.len = READ_ONCE(sqe->len);
2821         return 0;
2822 }
2823
2824 static bool io_req_cancelled(struct io_kiocb *req)
2825 {
2826         if (req->work.flags & IO_WQ_WORK_CANCEL) {
2827                 req_set_fail_links(req);
2828                 io_cqring_add_event(req, -ECANCELED);
2829                 io_put_req(req);
2830                 return true;
2831         }
2832
2833         return false;
2834 }
2835
2836 static void __io_fsync(struct io_kiocb *req)
2837 {
2838         loff_t end = req->sync.off + req->sync.len;
2839         int ret;
2840
2841         ret = vfs_fsync_range(req->file, req->sync.off,
2842                                 end > 0 ? end : LLONG_MAX,
2843                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2844         if (ret < 0)
2845                 req_set_fail_links(req);
2846         io_cqring_add_event(req, ret);
2847         io_put_req(req);
2848 }
2849
2850 static void io_fsync_finish(struct io_wq_work **workptr)
2851 {
2852         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2853
2854         if (io_req_cancelled(req))
2855                 return;
2856         __io_fsync(req);
2857         io_steal_work(req, workptr);
2858 }
2859
2860 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
2861 {
2862         /* fsync always requires a blocking context */
2863         if (force_nonblock) {
2864                 req->work.func = io_fsync_finish;
2865                 return -EAGAIN;
2866         }
2867         __io_fsync(req);
2868         return 0;
2869 }
2870
2871 static void __io_fallocate(struct io_kiocb *req)
2872 {
2873         int ret;
2874
2875         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2876         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2877                                 req->sync.len);
2878         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2879         if (ret < 0)
2880                 req_set_fail_links(req);
2881         io_cqring_add_event(req, ret);
2882         io_put_req(req);
2883 }
2884
2885 static void io_fallocate_finish(struct io_wq_work **workptr)
2886 {
2887         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2888
2889         if (io_req_cancelled(req))
2890                 return;
2891         __io_fallocate(req);
2892         io_steal_work(req, workptr);
2893 }
2894
2895 static int io_fallocate_prep(struct io_kiocb *req,
2896                              const struct io_uring_sqe *sqe)
2897 {
2898         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2899                 return -EINVAL;
2900
2901         req->sync.off = READ_ONCE(sqe->off);
2902         req->sync.len = READ_ONCE(sqe->addr);
2903         req->sync.mode = READ_ONCE(sqe->len);
2904         req->fsize = rlimit(RLIMIT_FSIZE);
2905         return 0;
2906 }
2907
2908 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
2909 {
2910         /* fallocate always requiring blocking context */
2911         if (force_nonblock) {
2912                 req->work.func = io_fallocate_finish;
2913                 return -EAGAIN;
2914         }
2915
2916         __io_fallocate(req);
2917         return 0;
2918 }
2919
2920 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2921 {
2922         const char __user *fname;
2923         int ret;
2924
2925         if (sqe->ioprio || sqe->buf_index)
2926                 return -EINVAL;
2927         if (req->flags & REQ_F_FIXED_FILE)
2928                 return -EBADF;
2929         if (req->flags & REQ_F_NEED_CLEANUP)
2930                 return 0;
2931
2932         req->open.dfd = READ_ONCE(sqe->fd);
2933         req->open.how.mode = READ_ONCE(sqe->len);
2934         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2935         req->open.how.flags = READ_ONCE(sqe->open_flags);
2936         if (force_o_largefile())
2937                 req->open.how.flags |= O_LARGEFILE;
2938
2939         req->open.filename = getname(fname);
2940         if (IS_ERR(req->open.filename)) {
2941                 ret = PTR_ERR(req->open.filename);
2942                 req->open.filename = NULL;
2943                 return ret;
2944         }
2945
2946         req->open.nofile = rlimit(RLIMIT_NOFILE);
2947         req->flags |= REQ_F_NEED_CLEANUP;
2948         return 0;
2949 }
2950
2951 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2952 {
2953         struct open_how __user *how;
2954         const char __user *fname;
2955         size_t len;
2956         int ret;
2957
2958         if (sqe->ioprio || sqe->buf_index)
2959                 return -EINVAL;
2960         if (req->flags & REQ_F_FIXED_FILE)
2961                 return -EBADF;
2962         if (req->flags & REQ_F_NEED_CLEANUP)
2963                 return 0;
2964
2965         req->open.dfd = READ_ONCE(sqe->fd);
2966         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
2967         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2968         len = READ_ONCE(sqe->len);
2969
2970         if (len < OPEN_HOW_SIZE_VER0)
2971                 return -EINVAL;
2972
2973         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
2974                                         len);
2975         if (ret)
2976                 return ret;
2977
2978         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
2979                 req->open.how.flags |= O_LARGEFILE;
2980
2981         req->open.filename = getname(fname);
2982         if (IS_ERR(req->open.filename)) {
2983                 ret = PTR_ERR(req->open.filename);
2984                 req->open.filename = NULL;
2985                 return ret;
2986         }
2987
2988         req->open.nofile = rlimit(RLIMIT_NOFILE);
2989         req->flags |= REQ_F_NEED_CLEANUP;
2990         return 0;
2991 }
2992
2993 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
2994 {
2995         struct open_flags op;
2996         struct file *file;
2997         int ret;
2998
2999         if (force_nonblock)
3000                 return -EAGAIN;
3001
3002         ret = build_open_flags(&req->open.how, &op);
3003         if (ret)
3004                 goto err;
3005
3006         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3007         if (ret < 0)
3008                 goto err;
3009
3010         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3011         if (IS_ERR(file)) {
3012                 put_unused_fd(ret);
3013                 ret = PTR_ERR(file);
3014         } else {
3015                 fsnotify_open(file);
3016                 fd_install(ret, file);
3017         }
3018 err:
3019         putname(req->open.filename);
3020         req->flags &= ~REQ_F_NEED_CLEANUP;
3021         if (ret < 0)
3022                 req_set_fail_links(req);
3023         io_cqring_add_event(req, ret);
3024         io_put_req(req);
3025         return 0;
3026 }
3027
3028 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3029 {
3030         req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
3031         return io_openat2(req, force_nonblock);
3032 }
3033
3034 static int io_remove_buffers_prep(struct io_kiocb *req,
3035                                   const struct io_uring_sqe *sqe)
3036 {
3037         struct io_provide_buf *p = &req->pbuf;
3038         u64 tmp;
3039
3040         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3041                 return -EINVAL;
3042
3043         tmp = READ_ONCE(sqe->fd);
3044         if (!tmp || tmp > USHRT_MAX)
3045                 return -EINVAL;
3046
3047         memset(p, 0, sizeof(*p));
3048         p->nbufs = tmp;
3049         p->bgid = READ_ONCE(sqe->buf_group);
3050         return 0;
3051 }
3052
3053 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3054                                int bgid, unsigned nbufs)
3055 {
3056         unsigned i = 0;
3057
3058         /* shouldn't happen */
3059         if (!nbufs)
3060                 return 0;
3061
3062         /* the head kbuf is the list itself */
3063         while (!list_empty(&buf->list)) {
3064                 struct io_buffer *nxt;
3065
3066                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3067                 list_del(&nxt->list);
3068                 kfree(nxt);
3069                 if (++i == nbufs)
3070                         return i;
3071         }
3072         i++;
3073         kfree(buf);
3074         idr_remove(&ctx->io_buffer_idr, bgid);
3075
3076         return i;
3077 }
3078
3079 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
3080 {
3081         struct io_provide_buf *p = &req->pbuf;
3082         struct io_ring_ctx *ctx = req->ctx;
3083         struct io_buffer *head;
3084         int ret = 0;
3085
3086         io_ring_submit_lock(ctx, !force_nonblock);
3087
3088         lockdep_assert_held(&ctx->uring_lock);
3089
3090         ret = -ENOENT;
3091         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3092         if (head)
3093                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3094
3095         io_ring_submit_lock(ctx, !force_nonblock);
3096         if (ret < 0)
3097                 req_set_fail_links(req);
3098         io_cqring_add_event(req, ret);
3099         io_put_req(req);
3100         return 0;
3101 }
3102
3103 static int io_provide_buffers_prep(struct io_kiocb *req,
3104                                    const struct io_uring_sqe *sqe)
3105 {
3106         struct io_provide_buf *p = &req->pbuf;
3107         u64 tmp;
3108
3109         if (sqe->ioprio || sqe->rw_flags)
3110                 return -EINVAL;
3111
3112         tmp = READ_ONCE(sqe->fd);
3113         if (!tmp || tmp > USHRT_MAX)
3114                 return -E2BIG;
3115         p->nbufs = tmp;
3116         p->addr = READ_ONCE(sqe->addr);
3117         p->len = READ_ONCE(sqe->len);
3118
3119         if (!access_ok(u64_to_user_ptr(p->addr), p->len))
3120                 return -EFAULT;
3121
3122         p->bgid = READ_ONCE(sqe->buf_group);
3123         tmp = READ_ONCE(sqe->off);
3124         if (tmp > USHRT_MAX)
3125                 return -E2BIG;
3126         p->bid = tmp;
3127         return 0;
3128 }
3129
3130 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3131 {
3132         struct io_buffer *buf;
3133         u64 addr = pbuf->addr;
3134         int i, bid = pbuf->bid;
3135
3136         for (i = 0; i < pbuf->nbufs; i++) {
3137                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3138                 if (!buf)
3139                         break;
3140
3141                 buf->addr = addr;
3142                 buf->len = pbuf->len;
3143                 buf->bid = bid;
3144                 addr += pbuf->len;
3145                 bid++;
3146                 if (!*head) {
3147                         INIT_LIST_HEAD(&buf->list);
3148                         *head = buf;
3149                 } else {
3150                         list_add_tail(&buf->list, &(*head)->list);
3151                 }
3152         }
3153
3154         return i ? i : -ENOMEM;
3155 }
3156
3157 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
3158 {
3159         struct io_provide_buf *p = &req->pbuf;
3160         struct io_ring_ctx *ctx = req->ctx;
3161         struct io_buffer *head, *list;
3162         int ret = 0;
3163
3164         io_ring_submit_lock(ctx, !force_nonblock);
3165
3166         lockdep_assert_held(&ctx->uring_lock);
3167
3168         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3169
3170         ret = io_add_buffers(p, &head);
3171         if (ret < 0)
3172                 goto out;
3173
3174         if (!list) {
3175                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3176                                         GFP_KERNEL);
3177                 if (ret < 0) {
3178                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3179                         goto out;
3180                 }
3181         }
3182 out:
3183         io_ring_submit_unlock(ctx, !force_nonblock);
3184         if (ret < 0)
3185                 req_set_fail_links(req);
3186         io_cqring_add_event(req, ret);
3187         io_put_req(req);
3188         return 0;
3189 }
3190
3191 static int io_epoll_ctl_prep(struct io_kiocb *req,
3192                              const struct io_uring_sqe *sqe)
3193 {
3194 #if defined(CONFIG_EPOLL)
3195         if (sqe->ioprio || sqe->buf_index)
3196                 return -EINVAL;
3197
3198         req->epoll.epfd = READ_ONCE(sqe->fd);
3199         req->epoll.op = READ_ONCE(sqe->len);
3200         req->epoll.fd = READ_ONCE(sqe->off);
3201
3202         if (ep_op_has_event(req->epoll.op)) {
3203                 struct epoll_event __user *ev;
3204
3205                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3206                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3207                         return -EFAULT;
3208         }
3209
3210         return 0;
3211 #else
3212         return -EOPNOTSUPP;
3213 #endif
3214 }
3215
3216 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3217 {
3218 #if defined(CONFIG_EPOLL)
3219         struct io_epoll *ie = &req->epoll;
3220         int ret;
3221
3222         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3223         if (force_nonblock && ret == -EAGAIN)
3224                 return -EAGAIN;
3225
3226         if (ret < 0)
3227                 req_set_fail_links(req);
3228         io_cqring_add_event(req, ret);
3229         io_put_req(req);
3230         return 0;
3231 #else
3232         return -EOPNOTSUPP;
3233 #endif
3234 }
3235
3236 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3237 {
3238 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3239         if (sqe->ioprio || sqe->buf_index || sqe->off)
3240                 return -EINVAL;
3241
3242         req->madvise.addr = READ_ONCE(sqe->addr);
3243         req->madvise.len = READ_ONCE(sqe->len);
3244         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3245         return 0;
3246 #else
3247         return -EOPNOTSUPP;
3248 #endif
3249 }
3250
3251 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3252 {
3253 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3254         struct io_madvise *ma = &req->madvise;
3255         int ret;
3256
3257         if (force_nonblock)
3258                 return -EAGAIN;
3259
3260         ret = do_madvise(ma->addr, ma->len, ma->advice);
3261         if (ret < 0)
3262                 req_set_fail_links(req);
3263         io_cqring_add_event(req, ret);
3264         io_put_req(req);
3265         return 0;
3266 #else
3267         return -EOPNOTSUPP;
3268 #endif
3269 }
3270
3271 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3272 {
3273         if (sqe->ioprio || sqe->buf_index || sqe->addr)
3274                 return -EINVAL;
3275
3276         req->fadvise.offset = READ_ONCE(sqe->off);
3277         req->fadvise.len = READ_ONCE(sqe->len);
3278         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3279         return 0;
3280 }
3281
3282 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3283 {
3284         struct io_fadvise *fa = &req->fadvise;
3285         int ret;
3286
3287         if (force_nonblock) {
3288                 switch (fa->advice) {
3289                 case POSIX_FADV_NORMAL:
3290                 case POSIX_FADV_RANDOM:
3291                 case POSIX_FADV_SEQUENTIAL:
3292                         break;
3293                 default:
3294                         return -EAGAIN;
3295                 }
3296         }
3297
3298         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3299         if (ret < 0)
3300                 req_set_fail_links(req);
3301         io_cqring_add_event(req, ret);
3302         io_put_req(req);
3303         return 0;
3304 }
3305
3306 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3307 {
3308         const char __user *fname;
3309         unsigned lookup_flags;
3310         int ret;
3311
3312         if (sqe->ioprio || sqe->buf_index)
3313                 return -EINVAL;
3314         if (req->flags & REQ_F_FIXED_FILE)
3315                 return -EBADF;
3316         if (req->flags & REQ_F_NEED_CLEANUP)
3317                 return 0;
3318
3319         req->open.dfd = READ_ONCE(sqe->fd);
3320         req->open.mask = READ_ONCE(sqe->len);
3321         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3322         req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3323         req->open.how.flags = READ_ONCE(sqe->statx_flags);
3324
3325         if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
3326                 return -EINVAL;
3327
3328         req->open.filename = getname_flags(fname, lookup_flags, NULL);
3329         if (IS_ERR(req->open.filename)) {
3330                 ret = PTR_ERR(req->open.filename);
3331                 req->open.filename = NULL;
3332                 return ret;
3333         }
3334
3335         req->flags |= REQ_F_NEED_CLEANUP;
3336         return 0;
3337 }
3338
3339 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3340 {
3341         struct io_open *ctx = &req->open;
3342         unsigned lookup_flags;
3343         struct path path;
3344         struct kstat stat;
3345         int ret;
3346
3347         if (force_nonblock) {
3348                 /* only need file table for an actual valid fd */
3349                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3350                         req->flags |= REQ_F_NO_FILE_TABLE;
3351                 return -EAGAIN;
3352         }
3353
3354         if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
3355                 return -EINVAL;
3356
3357 retry:
3358         /* filename_lookup() drops it, keep a reference */
3359         ctx->filename->refcnt++;
3360
3361         ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
3362                                 NULL);
3363         if (ret)
3364                 goto err;
3365
3366         ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
3367         path_put(&path);
3368         if (retry_estale(ret, lookup_flags)) {
3369                 lookup_flags |= LOOKUP_REVAL;
3370                 goto retry;
3371         }
3372         if (!ret)
3373                 ret = cp_statx(&stat, ctx->buffer);
3374 err:
3375         putname(ctx->filename);
3376         req->flags &= ~REQ_F_NEED_CLEANUP;
3377         if (ret < 0)
3378                 req_set_fail_links(req);
3379         io_cqring_add_event(req, ret);
3380         io_put_req(req);
3381         return 0;
3382 }
3383
3384 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3385 {
3386         /*
3387          * If we queue this for async, it must not be cancellable. That would
3388          * leave the 'file' in an undeterminate state.
3389          */
3390         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3391
3392         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3393             sqe->rw_flags || sqe->buf_index)
3394                 return -EINVAL;
3395         if (req->flags & REQ_F_FIXED_FILE)
3396                 return -EBADF;
3397
3398         req->close.fd = READ_ONCE(sqe->fd);
3399         if (req->file->f_op == &io_uring_fops ||
3400             req->close.fd == req->ctx->ring_fd)
3401                 return -EBADF;
3402
3403         return 0;
3404 }
3405
3406 /* only called when __close_fd_get_file() is done */
3407 static void __io_close_finish(struct io_kiocb *req)
3408 {
3409         int ret;
3410
3411         ret = filp_close(req->close.put_file, req->work.files);
3412         if (ret < 0)
3413                 req_set_fail_links(req);
3414         io_cqring_add_event(req, ret);
3415         fput(req->close.put_file);
3416         io_put_req(req);
3417 }
3418
3419 static void io_close_finish(struct io_wq_work **workptr)
3420 {
3421         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3422
3423         /* not cancellable, don't do io_req_cancelled() */
3424         __io_close_finish(req);
3425         io_steal_work(req, workptr);
3426 }
3427
3428 static int io_close(struct io_kiocb *req, bool force_nonblock)
3429 {
3430         int ret;
3431
3432         req->close.put_file = NULL;
3433         ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
3434         if (ret < 0)
3435                 return ret;
3436
3437         /* if the file has a flush method, be safe and punt to async */
3438         if (req->close.put_file->f_op->flush && force_nonblock) {
3439                 /* submission ref will be dropped, take it for async */
3440                 refcount_inc(&req->refs);
3441
3442                 req->work.func = io_close_finish;
3443                 /*
3444                  * Do manual async queue here to avoid grabbing files - we don't
3445                  * need the files, and it'll cause io_close_finish() to close
3446                  * the file again and cause a double CQE entry for this request
3447                  */
3448                 io_queue_async_work(req);
3449                 return 0;
3450         }
3451
3452         /*
3453          * No ->flush(), safely close from here and just punt the
3454          * fput() to async context.
3455          */
3456         __io_close_finish(req);
3457         return 0;
3458 }
3459
3460 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3461 {
3462         struct io_ring_ctx *ctx = req->ctx;
3463
3464         if (!req->file)
3465                 return -EBADF;
3466
3467         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3468                 return -EINVAL;
3469         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3470                 return -EINVAL;
3471
3472         req->sync.off = READ_ONCE(sqe->off);
3473         req->sync.len = READ_ONCE(sqe->len);
3474         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
3475         return 0;
3476 }
3477
3478 static void __io_sync_file_range(struct io_kiocb *req)
3479 {
3480         int ret;
3481
3482         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
3483                                 req->sync.flags);
3484         if (ret < 0)
3485                 req_set_fail_links(req);
3486         io_cqring_add_event(req, ret);
3487         io_put_req(req);
3488 }
3489
3490
3491 static void io_sync_file_range_finish(struct io_wq_work **workptr)
3492 {
3493         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3494
3495         if (io_req_cancelled(req))
3496                 return;
3497         __io_sync_file_range(req);
3498         io_steal_work(req, workptr);
3499 }
3500
3501 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
3502 {
3503         /* sync_file_range always requires a blocking context */
3504         if (force_nonblock) {
3505                 req->work.func = io_sync_file_range_finish;
3506                 return -EAGAIN;
3507         }
3508
3509         __io_sync_file_range(req);
3510         return 0;
3511 }
3512
3513 #if defined(CONFIG_NET)
3514 static int io_setup_async_msg(struct io_kiocb *req,
3515                               struct io_async_msghdr *kmsg)
3516 {
3517         if (req->io)
3518                 return -EAGAIN;
3519         if (io_alloc_async_ctx(req)) {
3520                 if (kmsg->iov != kmsg->fast_iov)
3521                         kfree(kmsg->iov);
3522                 return -ENOMEM;
3523         }
3524         req->flags |= REQ_F_NEED_CLEANUP;
3525         memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3526         return -EAGAIN;
3527 }
3528
3529 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3530 {
3531         struct io_sr_msg *sr = &req->sr_msg;
3532         struct io_async_ctx *io = req->io;
3533         int ret;
3534
3535         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3536         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3537         sr->len = READ_ONCE(sqe->len);
3538
3539 #ifdef CONFIG_COMPAT
3540         if (req->ctx->compat)
3541                 sr->msg_flags |= MSG_CMSG_COMPAT;
3542 #endif
3543
3544         if (!io || req->opcode == IORING_OP_SEND)
3545                 return 0;
3546         /* iovec is already imported */
3547         if (req->flags & REQ_F_NEED_CLEANUP)
3548                 return 0;
3549
3550         io->msg.iov = io->msg.fast_iov;
3551         ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
3552                                         &io->msg.iov);
3553         if (!ret)
3554                 req->flags |= REQ_F_NEED_CLEANUP;
3555         return ret;
3556 }
3557
3558 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
3559 {
3560         struct io_async_msghdr *kmsg = NULL;
3561         struct socket *sock;
3562         int ret;
3563
3564         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3565                 return -EINVAL;
3566
3567         sock = sock_from_file(req->file, &ret);
3568         if (sock) {
3569                 struct io_async_ctx io;
3570                 unsigned flags;
3571
3572                 if (req->io) {
3573                         kmsg = &req->io->msg;
3574                         kmsg->msg.msg_name = &req->io->msg.addr;
3575                         /* if iov is set, it's allocated already */
3576                         if (!kmsg->iov)
3577                                 kmsg->iov = kmsg->fast_iov;
3578                         kmsg->msg.msg_iter.iov = kmsg->iov;
3579                 } else {
3580                         struct io_sr_msg *sr = &req->sr_msg;
3581
3582                         kmsg = &io.msg;
3583                         kmsg->msg.msg_name = &io.msg.addr;
3584
3585                         io.msg.iov = io.msg.fast_iov;
3586                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3587                                         sr->msg_flags, &io.msg.iov);
3588                         if (ret)
3589                                 return ret;
3590                 }
3591
3592                 flags = req->sr_msg.msg_flags;
3593                 if (flags & MSG_DONTWAIT)
3594                         req->flags |= REQ_F_NOWAIT;
3595                 else if (force_nonblock)
3596                         flags |= MSG_DONTWAIT;
3597
3598                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
3599                 if (force_nonblock && ret == -EAGAIN)
3600                         return io_setup_async_msg(req, kmsg);
3601                 if (ret == -ERESTARTSYS)
3602                         ret = -EINTR;
3603         }
3604
3605         if (kmsg && kmsg->iov != kmsg->fast_iov)
3606                 kfree(kmsg->iov);
3607         req->flags &= ~REQ_F_NEED_CLEANUP;
3608         io_cqring_add_event(req, ret);
3609         if (ret < 0)
3610                 req_set_fail_links(req);
3611         io_put_req(req);
3612         return 0;
3613 }
3614
3615 static int io_send(struct io_kiocb *req, bool force_nonblock)
3616 {
3617         struct socket *sock;
3618         int ret;
3619
3620         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3621                 return -EINVAL;
3622
3623         sock = sock_from_file(req->file, &ret);
3624         if (sock) {
3625                 struct io_sr_msg *sr = &req->sr_msg;
3626                 struct msghdr msg;
3627                 struct iovec iov;
3628                 unsigned flags;
3629
3630                 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3631                                                 &msg.msg_iter);
3632                 if (ret)
3633                         return ret;
3634
3635                 msg.msg_name = NULL;
3636                 msg.msg_control = NULL;
3637                 msg.msg_controllen = 0;
3638                 msg.msg_namelen = 0;
3639
3640                 flags = req->sr_msg.msg_flags;
3641                 if (flags & MSG_DONTWAIT)
3642                         req->flags |= REQ_F_NOWAIT;
3643                 else if (force_nonblock)
3644                         flags |= MSG_DONTWAIT;
3645
3646                 msg.msg_flags = flags;
3647                 ret = sock_sendmsg(sock, &msg);
3648                 if (force_nonblock && ret == -EAGAIN)
3649                         return -EAGAIN;
3650                 if (ret == -ERESTARTSYS)
3651                         ret = -EINTR;
3652         }
3653
3654         io_cqring_add_event(req, ret);
3655         if (ret < 0)
3656                 req_set_fail_links(req);
3657         io_put_req(req);
3658         return 0;
3659 }
3660
3661 static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3662 {
3663         struct io_sr_msg *sr = &req->sr_msg;
3664         struct iovec __user *uiov;
3665         size_t iov_len;
3666         int ret;
3667
3668         ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3669                                         &uiov, &iov_len);
3670         if (ret)
3671                 return ret;
3672
3673         if (req->flags & REQ_F_BUFFER_SELECT) {
3674                 if (iov_len > 1)
3675                         return -EINVAL;
3676                 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3677                         return -EFAULT;
3678                 sr->len = io->msg.iov[0].iov_len;
3679                 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3680                                 sr->len);
3681                 io->msg.iov = NULL;
3682         } else {
3683                 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3684                                         &io->msg.iov, &io->msg.msg.msg_iter);
3685                 if (ret > 0)
3686                         ret = 0;
3687         }
3688
3689         return ret;
3690 }
3691
3692 #ifdef CONFIG_COMPAT
3693 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
3694                                         struct io_async_ctx *io)
3695 {
3696         struct compat_msghdr __user *msg_compat;
3697         struct io_sr_msg *sr = &req->sr_msg;
3698         struct compat_iovec __user *uiov;
3699         compat_uptr_t ptr;
3700         compat_size_t len;
3701         int ret;
3702
3703         msg_compat = (struct compat_msghdr __user *) sr->msg;
3704         ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
3705                                         &ptr, &len);
3706         if (ret)
3707                 return ret;
3708
3709         uiov = compat_ptr(ptr);
3710         if (req->flags & REQ_F_BUFFER_SELECT) {
3711                 compat_ssize_t clen;
3712
3713                 if (len > 1)
3714                         return -EINVAL;
3715                 if (!access_ok(uiov, sizeof(*uiov)))
3716                         return -EFAULT;
3717                 if (__get_user(clen, &uiov->iov_len))
3718                         return -EFAULT;
3719                 if (clen < 0)
3720                         return -EINVAL;
3721                 sr->len = io->msg.iov[0].iov_len;
3722                 io->msg.iov = NULL;
3723         } else {
3724                 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
3725                                                 &io->msg.iov,
3726                                                 &io->msg.msg.msg_iter);
3727                 if (ret < 0)
3728                         return ret;
3729         }
3730
3731         return 0;
3732 }
3733 #endif
3734
3735 static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3736 {
3737         io->msg.iov = io->msg.fast_iov;
3738
3739 #ifdef CONFIG_COMPAT
3740         if (req->ctx->compat)
3741                 return __io_compat_recvmsg_copy_hdr(req, io);
3742 #endif
3743
3744         return __io_recvmsg_copy_hdr(req, io);
3745 }
3746
3747 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
3748                                                int *cflags, bool needs_lock)
3749 {
3750         struct io_sr_msg *sr = &req->sr_msg;
3751         struct io_buffer *kbuf;
3752
3753         if (!(req->flags & REQ_F_BUFFER_SELECT))
3754                 return NULL;
3755
3756         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
3757         if (IS_ERR(kbuf))
3758                 return kbuf;
3759
3760         sr->kbuf = kbuf;
3761         req->flags |= REQ_F_BUFFER_SELECTED;
3762
3763         *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
3764         *cflags |= IORING_CQE_F_BUFFER;
3765         return kbuf;
3766 }
3767
3768 static int io_recvmsg_prep(struct io_kiocb *req,
3769                            const struct io_uring_sqe *sqe)
3770 {
3771         struct io_sr_msg *sr = &req->sr_msg;
3772         struct io_async_ctx *io = req->io;
3773         int ret;
3774
3775         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3776         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3777         sr->len = READ_ONCE(sqe->len);
3778         sr->bgid = READ_ONCE(sqe->buf_group);
3779
3780 #ifdef CONFIG_COMPAT
3781         if (req->ctx->compat)
3782                 sr->msg_flags |= MSG_CMSG_COMPAT;
3783 #endif
3784
3785         if (!io || req->opcode == IORING_OP_RECV)
3786                 return 0;
3787         /* iovec is already imported */
3788         if (req->flags & REQ_F_NEED_CLEANUP)
3789                 return 0;
3790
3791         ret = io_recvmsg_copy_hdr(req, io);
3792         if (!ret)
3793                 req->flags |= REQ_F_NEED_CLEANUP;
3794         return ret;
3795 }
3796
3797 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
3798 {
3799         struct io_async_msghdr *kmsg = NULL;
3800         struct socket *sock;
3801         int ret, cflags = 0;
3802
3803         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3804                 return -EINVAL;
3805
3806         sock = sock_from_file(req->file, &ret);
3807         if (sock) {
3808                 struct io_buffer *kbuf;
3809                 struct io_async_ctx io;
3810                 unsigned flags;
3811
3812                 if (req->io) {
3813                         kmsg = &req->io->msg;
3814                         kmsg->msg.msg_name = &req->io->msg.addr;
3815                         /* if iov is set, it's allocated already */
3816                         if (!kmsg->iov)
3817                                 kmsg->iov = kmsg->fast_iov;
3818                         kmsg->msg.msg_iter.iov = kmsg->iov;
3819                 } else {
3820                         kmsg = &io.msg;
3821                         kmsg->msg.msg_name = &io.msg.addr;
3822
3823                         ret = io_recvmsg_copy_hdr(req, &io);
3824                         if (ret)
3825                                 return ret;
3826                 }
3827
3828                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3829                 if (IS_ERR(kbuf)) {
3830                         return PTR_ERR(kbuf);
3831                 } else if (kbuf) {
3832                         kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3833                         iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
3834                                         1, req->sr_msg.len);
3835                 }
3836
3837                 flags = req->sr_msg.msg_flags;
3838                 if (flags & MSG_DONTWAIT)
3839                         req->flags |= REQ_F_NOWAIT;
3840                 else if (force_nonblock)
3841                         flags |= MSG_DONTWAIT;
3842
3843                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3844                                                 kmsg->uaddr, flags);
3845                 if (force_nonblock && ret == -EAGAIN)
3846                         return io_setup_async_msg(req, kmsg);
3847                 if (ret == -ERESTARTSYS)
3848                         ret = -EINTR;
3849         }
3850
3851         if (kmsg && kmsg->iov != kmsg->fast_iov)
3852                 kfree(kmsg->iov);
3853         req->flags &= ~REQ_F_NEED_CLEANUP;
3854         __io_cqring_add_event(req, ret, cflags);
3855         if (ret < 0)
3856                 req_set_fail_links(req);
3857         io_put_req(req);
3858         return 0;
3859 }
3860
3861 static int io_recv(struct io_kiocb *req, bool force_nonblock)
3862 {
3863         struct io_buffer *kbuf = NULL;
3864         struct socket *sock;
3865         int ret, cflags = 0;
3866
3867         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3868                 return -EINVAL;
3869
3870         sock = sock_from_file(req->file, &ret);
3871         if (sock) {
3872                 struct io_sr_msg *sr = &req->sr_msg;
3873                 void __user *buf = sr->buf;
3874                 struct msghdr msg;
3875                 struct iovec iov;
3876                 unsigned flags;
3877
3878                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3879                 if (IS_ERR(kbuf))
3880                         return PTR_ERR(kbuf);
3881                 else if (kbuf)
3882                         buf = u64_to_user_ptr(kbuf->addr);
3883
3884                 ret = import_single_range(READ, buf, sr->len, &iov,
3885                                                 &msg.msg_iter);
3886                 if (ret) {
3887                         kfree(kbuf);
3888                         return ret;
3889                 }
3890
3891                 req->flags |= REQ_F_NEED_CLEANUP;
3892                 msg.msg_name = NULL;
3893                 msg.msg_control = NULL;
3894                 msg.msg_controllen = 0;
3895                 msg.msg_namelen = 0;
3896                 msg.msg_iocb = NULL;
3897                 msg.msg_flags = 0;
3898
3899                 flags = req->sr_msg.msg_flags;
3900                 if (flags & MSG_DONTWAIT)
3901                         req->flags |= REQ_F_NOWAIT;
3902                 else if (force_nonblock)
3903                         flags |= MSG_DONTWAIT;
3904
3905                 ret = sock_recvmsg(sock, &msg, flags);
3906                 if (force_nonblock && ret == -EAGAIN)
3907                         return -EAGAIN;
3908                 if (ret == -ERESTARTSYS)
3909                         ret = -EINTR;
3910         }
3911
3912         kfree(kbuf);
3913         req->flags &= ~REQ_F_NEED_CLEANUP;
3914         __io_cqring_add_event(req, ret, cflags);
3915         if (ret < 0)
3916                 req_set_fail_links(req);
3917         io_put_req(req);
3918         return 0;
3919 }
3920
3921 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3922 {
3923         struct io_accept *accept = &req->accept;
3924
3925         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3926                 return -EINVAL;
3927         if (sqe->ioprio || sqe->len || sqe->buf_index)
3928                 return -EINVAL;
3929
3930         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3931         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3932         accept->flags = READ_ONCE(sqe->accept_flags);
3933         accept->nofile = rlimit(RLIMIT_NOFILE);
3934         return 0;
3935 }
3936
3937 static int __io_accept(struct io_kiocb *req, bool force_nonblock)
3938 {
3939         struct io_accept *accept = &req->accept;
3940         unsigned file_flags;
3941         int ret;
3942
3943         file_flags = force_nonblock ? O_NONBLOCK : 0;
3944         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3945                                         accept->addr_len, accept->flags,
3946                                         accept->nofile);
3947         if (ret == -EAGAIN && force_nonblock)
3948                 return -EAGAIN;
3949         if (ret == -ERESTARTSYS)
3950                 ret = -EINTR;
3951         if (ret < 0)
3952                 req_set_fail_links(req);
3953         io_cqring_add_event(req, ret);
3954         io_put_req(req);
3955         return 0;
3956 }
3957
3958 static void io_accept_finish(struct io_wq_work **workptr)
3959 {
3960         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
3961
3962         if (io_req_cancelled(req))
3963                 return;
3964         __io_accept(req, false);
3965         io_steal_work(req, workptr);
3966 }
3967
3968 static int io_accept(struct io_kiocb *req, bool force_nonblock)
3969 {
3970         int ret;
3971
3972         ret = __io_accept(req, force_nonblock);
3973         if (ret == -EAGAIN && force_nonblock) {
3974                 req->work.func = io_accept_finish;
3975                 return -EAGAIN;
3976         }
3977         return 0;
3978 }
3979
3980 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3981 {
3982         struct io_connect *conn = &req->connect;
3983         struct io_async_ctx *io = req->io;
3984
3985         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3986                 return -EINVAL;
3987         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3988                 return -EINVAL;
3989
3990         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3991         conn->addr_len =  READ_ONCE(sqe->addr2);
3992
3993         if (!io)
3994                 return 0;
3995
3996         return move_addr_to_kernel(conn->addr, conn->addr_len,
3997                                         &io->connect.address);
3998 }
3999
4000 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4001 {
4002         struct io_async_ctx __io, *io;
4003         unsigned file_flags;
4004         int ret;
4005
4006         if (req->io) {
4007                 io = req->io;
4008         } else {
4009                 ret = move_addr_to_kernel(req->connect.addr,
4010                                                 req->connect.addr_len,
4011                                                 &__io.connect.address);
4012                 if (ret)
4013                         goto out;
4014                 io = &__io;
4015         }
4016
4017         file_flags = force_nonblock ? O_NONBLOCK : 0;
4018
4019         ret = __sys_connect_file(req->file, &io->connect.address,
4020                                         req->connect.addr_len, file_flags);
4021         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4022                 if (req->io)
4023                         return -EAGAIN;
4024                 if (io_alloc_async_ctx(req)) {
4025                         ret = -ENOMEM;
4026                         goto out;
4027                 }
4028                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
4029                 return -EAGAIN;
4030         }
4031         if (ret == -ERESTARTSYS)
4032                 ret = -EINTR;
4033 out:
4034         if (ret < 0)
4035                 req_set_fail_links(req);
4036         io_cqring_add_event(req, ret);
4037         io_put_req(req);
4038         return 0;
4039 }
4040 #else /* !CONFIG_NET */
4041 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4042 {
4043         return -EOPNOTSUPP;
4044 }
4045
4046 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
4047 {
4048         return -EOPNOTSUPP;
4049 }
4050
4051 static int io_send(struct io_kiocb *req, bool force_nonblock)
4052 {
4053         return -EOPNOTSUPP;
4054 }
4055
4056 static int io_recvmsg_prep(struct io_kiocb *req,
4057                            const struct io_uring_sqe *sqe)
4058 {
4059         return -EOPNOTSUPP;
4060 }
4061
4062 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
4063 {
4064         return -EOPNOTSUPP;
4065 }
4066
4067 static int io_recv(struct io_kiocb *req, bool force_nonblock)
4068 {
4069         return -EOPNOTSUPP;
4070 }
4071
4072 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4073 {
4074         return -EOPNOTSUPP;
4075 }
4076
4077 static int io_accept(struct io_kiocb *req, bool force_nonblock)
4078 {
4079         return -EOPNOTSUPP;
4080 }
4081
4082 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4083 {
4084         return -EOPNOTSUPP;
4085 }
4086
4087 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4088 {
4089         return -EOPNOTSUPP;
4090 }
4091 #endif /* CONFIG_NET */
4092
4093 struct io_poll_table {
4094         struct poll_table_struct pt;
4095         struct io_kiocb *req;
4096         int error;
4097 };
4098
4099 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4100                             struct wait_queue_head *head)
4101 {
4102         if (unlikely(poll->head)) {
4103                 pt->error = -EINVAL;
4104                 return;
4105         }
4106
4107         pt->error = 0;
4108         poll->head = head;
4109         add_wait_queue(head, &poll->wait);
4110 }
4111
4112 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4113                                struct poll_table_struct *p)
4114 {
4115         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4116
4117         __io_queue_proc(&pt->req->apoll->poll, pt, head);
4118 }
4119
4120 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4121                            __poll_t mask, task_work_func_t func)
4122 {
4123         struct task_struct *tsk;
4124         int ret;
4125
4126         /* for instances that support it check for an event match first: */
4127         if (mask && !(mask & poll->events))
4128                 return 0;
4129
4130         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4131
4132         list_del_init(&poll->wait.entry);
4133
4134         tsk = req->task;
4135         req->result = mask;
4136         init_task_work(&req->task_work, func);
4137         /*
4138          * If this fails, then the task is exiting. When a task exits, the
4139          * work gets canceled, so just cancel this request as well instead
4140          * of executing it. We can't safely execute it anyway, as we may not
4141          * have the needed state needed for it anyway.
4142          */
4143         ret = task_work_add(tsk, &req->task_work, true);
4144         if (unlikely(ret)) {
4145                 WRITE_ONCE(poll->canceled, true);
4146                 tsk = io_wq_get_task(req->ctx->io_wq);
4147                 task_work_add(tsk, &req->task_work, true);
4148         }
4149         wake_up_process(tsk);
4150         return 1;
4151 }
4152
4153 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4154         __acquires(&req->ctx->completion_lock)
4155 {
4156         struct io_ring_ctx *ctx = req->ctx;
4157
4158         if (!req->result && !READ_ONCE(poll->canceled)) {
4159                 struct poll_table_struct pt = { ._key = poll->events };
4160
4161                 req->result = vfs_poll(req->file, &pt) & poll->events;
4162         }
4163
4164         spin_lock_irq(&ctx->completion_lock);
4165         if (!req->result && !READ_ONCE(poll->canceled)) {
4166                 add_wait_queue(poll->head, &poll->wait);
4167                 return true;
4168         }
4169
4170         return false;
4171 }
4172
4173 static void io_async_task_func(struct callback_head *cb)
4174 {
4175         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4176         struct async_poll *apoll = req->apoll;
4177         struct io_ring_ctx *ctx = req->ctx;
4178         bool canceled;
4179
4180         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4181
4182         if (io_poll_rewait(req, &apoll->poll)) {
4183                 spin_unlock_irq(&ctx->completion_lock);
4184                 return;
4185         }
4186
4187         if (hash_hashed(&req->hash_node))
4188                 hash_del(&req->hash_node);
4189
4190         canceled = READ_ONCE(apoll->poll.canceled);
4191         if (canceled) {
4192                 io_cqring_fill_event(req, -ECANCELED);
4193                 io_commit_cqring(ctx);
4194         }
4195
4196         spin_unlock_irq(&ctx->completion_lock);
4197
4198         /* restore ->work in case we need to retry again */
4199         memcpy(&req->work, &apoll->work, sizeof(req->work));
4200
4201         if (canceled) {
4202                 kfree(apoll);
4203                 io_cqring_ev_posted(ctx);
4204                 req_set_fail_links(req);
4205                 io_double_put_req(req);
4206                 return;
4207         }
4208
4209         __set_current_state(TASK_RUNNING);
4210         mutex_lock(&ctx->uring_lock);
4211         __io_queue_sqe(req, NULL);
4212         mutex_unlock(&ctx->uring_lock);
4213
4214         kfree(apoll);
4215 }
4216
4217 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4218                         void *key)
4219 {
4220         struct io_kiocb *req = wait->private;
4221         struct io_poll_iocb *poll = &req->apoll->poll;
4222
4223         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4224                                         key_to_poll(key));
4225
4226         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4227 }
4228
4229 static void io_poll_req_insert(struct io_kiocb *req)
4230 {
4231         struct io_ring_ctx *ctx = req->ctx;
4232         struct hlist_head *list;
4233
4234         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4235         hlist_add_head(&req->hash_node, list);
4236 }
4237
4238 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4239                                       struct io_poll_iocb *poll,
4240                                       struct io_poll_table *ipt, __poll_t mask,
4241                                       wait_queue_func_t wake_func)
4242         __acquires(&ctx->completion_lock)
4243 {
4244         struct io_ring_ctx *ctx = req->ctx;
4245         bool cancel = false;
4246
4247         poll->file = req->file;
4248         poll->head = NULL;
4249         poll->done = poll->canceled = false;
4250         poll->events = mask;
4251
4252         ipt->pt._key = mask;
4253         ipt->req = req;
4254         ipt->error = -EINVAL;
4255
4256         INIT_LIST_HEAD(&poll->wait.entry);
4257         init_waitqueue_func_entry(&poll->wait, wake_func);
4258         poll->wait.private = req;
4259
4260         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4261
4262         spin_lock_irq(&ctx->completion_lock);
4263         if (likely(poll->head)) {
4264                 spin_lock(&poll->head->lock);
4265                 if (unlikely(list_empty(&poll->wait.entry))) {
4266                         if (ipt->error)
4267                                 cancel = true;
4268                         ipt->error = 0;
4269                         mask = 0;
4270                 }
4271                 if (mask || ipt->error)
4272                         list_del_init(&poll->wait.entry);
4273                 else if (cancel)
4274                         WRITE_ONCE(poll->canceled, true);
4275                 else if (!poll->done) /* actually waiting for an event */
4276                         io_poll_req_insert(req);
4277                 spin_unlock(&poll->head->lock);
4278         }
4279
4280         return mask;
4281 }
4282
4283 static bool io_arm_poll_handler(struct io_kiocb *req)
4284 {
4285         const struct io_op_def *def = &io_op_defs[req->opcode];
4286         struct io_ring_ctx *ctx = req->ctx;
4287         struct async_poll *apoll;
4288         struct io_poll_table ipt;
4289         __poll_t mask, ret;
4290
4291         if (!req->file || !file_can_poll(req->file))
4292                 return false;
4293         if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
4294                 return false;
4295         if (!def->pollin && !def->pollout)
4296                 return false;
4297
4298         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4299         if (unlikely(!apoll))
4300                 return false;
4301
4302         req->flags |= REQ_F_POLLED;
4303         memcpy(&apoll->work, &req->work, sizeof(req->work));
4304
4305         get_task_struct(current);
4306         req->task = current;
4307         req->apoll = apoll;
4308         INIT_HLIST_NODE(&req->hash_node);
4309
4310         mask = 0;
4311         if (def->pollin)
4312                 mask |= POLLIN | POLLRDNORM;
4313         if (def->pollout)
4314                 mask |= POLLOUT | POLLWRNORM;
4315         mask |= POLLERR | POLLPRI;
4316
4317         ipt.pt._qproc = io_async_queue_proc;
4318
4319         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4320                                         io_async_wake);
4321         if (ret) {
4322                 ipt.error = 0;
4323                 apoll->poll.done = true;
4324                 spin_unlock_irq(&ctx->completion_lock);
4325                 memcpy(&req->work, &apoll->work, sizeof(req->work));
4326                 kfree(apoll);
4327                 return false;
4328         }
4329         spin_unlock_irq(&ctx->completion_lock);
4330         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4331                                         apoll->poll.events);
4332         return true;
4333 }
4334
4335 static bool __io_poll_remove_one(struct io_kiocb *req,
4336                                  struct io_poll_iocb *poll)
4337 {
4338         bool do_complete = false;
4339
4340         spin_lock(&poll->head->lock);
4341         WRITE_ONCE(poll->canceled, true);
4342         if (!list_empty(&poll->wait.entry)) {
4343                 list_del_init(&poll->wait.entry);
4344                 do_complete = true;
4345         }
4346         spin_unlock(&poll->head->lock);
4347         return do_complete;
4348 }
4349
4350 static bool io_poll_remove_one(struct io_kiocb *req)
4351 {
4352         struct async_poll *apoll = NULL;
4353         bool do_complete;
4354
4355         if (req->opcode == IORING_OP_POLL_ADD) {
4356                 do_complete = __io_poll_remove_one(req, &req->poll);
4357         } else {
4358                 apoll = req->apoll;
4359                 /* non-poll requests have submit ref still */
4360                 do_complete = __io_poll_remove_one(req, &req->apoll->poll);
4361                 if (do_complete)
4362                         io_put_req(req);
4363         }
4364
4365         hash_del(&req->hash_node);
4366
4367         if (do_complete && apoll) {
4368                 /*
4369                  * restore ->work because we need to call io_req_work_drop_env.
4370                  */
4371                 memcpy(&req->work, &apoll->work, sizeof(req->work));
4372                 kfree(apoll);
4373         }
4374
4375         if (do_complete) {
4376                 io_cqring_fill_event(req, -ECANCELED);
4377                 io_commit_cqring(req->ctx);
4378                 req->flags |= REQ_F_COMP_LOCKED;
4379                 io_put_req(req);
4380         }
4381
4382         return do_complete;
4383 }
4384
4385 static void io_poll_remove_all(struct io_ring_ctx *ctx)
4386 {
4387         struct hlist_node *tmp;
4388         struct io_kiocb *req;
4389         int posted = 0, i;
4390
4391         spin_lock_irq(&ctx->completion_lock);
4392         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4393                 struct hlist_head *list;
4394
4395                 list = &ctx->cancel_hash[i];
4396                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4397                         posted += io_poll_remove_one(req);
4398         }
4399         spin_unlock_irq(&ctx->completion_lock);
4400
4401         if (posted)
4402                 io_cqring_ev_posted(ctx);
4403 }
4404
4405 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4406 {
4407         struct hlist_head *list;
4408         struct io_kiocb *req;
4409
4410         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4411         hlist_for_each_entry(req, list, hash_node) {
4412                 if (sqe_addr != req->user_data)
4413                         continue;
4414                 if (io_poll_remove_one(req))
4415                         return 0;
4416                 return -EALREADY;
4417         }
4418
4419         return -ENOENT;
4420 }
4421
4422 static int io_poll_remove_prep(struct io_kiocb *req,
4423                                const struct io_uring_sqe *sqe)
4424 {
4425         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4426                 return -EINVAL;
4427         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4428             sqe->poll_events)
4429                 return -EINVAL;
4430
4431         req->poll.addr = READ_ONCE(sqe->addr);
4432         return 0;
4433 }
4434
4435 /*
4436  * Find a running poll command that matches one specified in sqe->addr,
4437  * and remove it if found.
4438  */
4439 static int io_poll_remove(struct io_kiocb *req)
4440 {
4441         struct io_ring_ctx *ctx = req->ctx;
4442         u64 addr;
4443         int ret;
4444
4445         addr = req->poll.addr;
4446         spin_lock_irq(&ctx->completion_lock);
4447         ret = io_poll_cancel(ctx, addr);
4448         spin_unlock_irq(&ctx->completion_lock);
4449
4450         io_cqring_add_event(req, ret);
4451         if (ret < 0)
4452                 req_set_fail_links(req);
4453         io_put_req(req);
4454         return 0;
4455 }
4456
4457 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4458 {
4459         struct io_ring_ctx *ctx = req->ctx;
4460
4461         req->poll.done = true;
4462         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4463         io_commit_cqring(ctx);
4464 }
4465
4466 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4467 {
4468         struct io_ring_ctx *ctx = req->ctx;
4469         struct io_poll_iocb *poll = &req->poll;
4470
4471         if (io_poll_rewait(req, poll)) {
4472                 spin_unlock_irq(&ctx->completion_lock);
4473                 return;
4474         }
4475
4476         hash_del(&req->hash_node);
4477         io_poll_complete(req, req->result, 0);
4478         req->flags |= REQ_F_COMP_LOCKED;
4479         io_put_req_find_next(req, nxt);
4480         spin_unlock_irq(&ctx->completion_lock);
4481
4482         io_cqring_ev_posted(ctx);
4483 }
4484
4485 static void io_poll_task_func(struct callback_head *cb)
4486 {
4487         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4488         struct io_kiocb *nxt = NULL;
4489
4490         io_poll_task_handler(req, &nxt);
4491         if (nxt) {
4492                 struct io_ring_ctx *ctx = nxt->ctx;
4493
4494                 mutex_lock(&ctx->uring_lock);
4495                 __io_queue_sqe(nxt, NULL);
4496                 mutex_unlock(&ctx->uring_lock);
4497         }
4498 }
4499
4500 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4501                         void *key)
4502 {
4503         struct io_kiocb *req = wait->private;
4504         struct io_poll_iocb *poll = &req->poll;
4505
4506         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
4507 }
4508
4509 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4510                                struct poll_table_struct *p)
4511 {
4512         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4513
4514         __io_queue_proc(&pt->req->poll, pt, head);
4515 }
4516
4517 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4518 {
4519         struct io_poll_iocb *poll = &req->poll;
4520         u16 events;
4521
4522         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4523                 return -EINVAL;
4524         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4525                 return -EINVAL;
4526         if (!poll->file)
4527                 return -EBADF;
4528
4529         events = READ_ONCE(sqe->poll_events);
4530         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
4531
4532         get_task_struct(current);
4533         req->task = current;
4534         return 0;
4535 }
4536
4537 static int io_poll_add(struct io_kiocb *req)
4538 {
4539         struct io_poll_iocb *poll = &req->poll;
4540         struct io_ring_ctx *ctx = req->ctx;
4541         struct io_poll_table ipt;
4542         __poll_t mask;
4543
4544         INIT_HLIST_NODE(&req->hash_node);
4545         INIT_LIST_HEAD(&req->list);
4546         ipt.pt._qproc = io_poll_queue_proc;
4547
4548         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4549                                         io_poll_wake);
4550
4551         if (mask) { /* no async, we'd stolen it */
4552                 ipt.error = 0;
4553                 io_poll_complete(req, mask, 0);
4554         }
4555         spin_unlock_irq(&ctx->completion_lock);
4556
4557         if (mask) {
4558                 io_cqring_ev_posted(ctx);
4559                 io_put_req(req);
4560         }
4561         return ipt.error;
4562 }
4563
4564 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4565 {
4566         struct io_timeout_data *data = container_of(timer,
4567                                                 struct io_timeout_data, timer);
4568         struct io_kiocb *req = data->req;
4569         struct io_ring_ctx *ctx = req->ctx;
4570         unsigned long flags;
4571
4572         atomic_inc(&ctx->cq_timeouts);
4573
4574         spin_lock_irqsave(&ctx->completion_lock, flags);
4575         /*
4576          * We could be racing with timeout deletion. If the list is empty,
4577          * then timeout lookup already found it and will be handling it.
4578          */
4579         if (!list_empty(&req->list)) {
4580                 struct io_kiocb *prev;
4581
4582                 /*
4583                  * Adjust the reqs sequence before the current one because it
4584                  * will consume a slot in the cq_ring and the cq_tail
4585                  * pointer will be increased, otherwise other timeout reqs may
4586                  * return in advance without waiting for enough wait_nr.
4587                  */
4588                 prev = req;
4589                 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
4590                         prev->sequence++;
4591                 list_del_init(&req->list);
4592         }
4593
4594         io_cqring_fill_event(req, -ETIME);
4595         io_commit_cqring(ctx);
4596         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4597
4598         io_cqring_ev_posted(ctx);
4599         req_set_fail_links(req);
4600         io_put_req(req);
4601         return HRTIMER_NORESTART;
4602 }
4603
4604 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4605 {
4606         struct io_kiocb *req;
4607         int ret = -ENOENT;
4608
4609         list_for_each_entry(req, &ctx->timeout_list, list) {
4610                 if (user_data == req->user_data) {
4611                         list_del_init(&req->list);
4612                         ret = 0;
4613                         break;
4614                 }
4615         }
4616
4617         if (ret == -ENOENT)
4618                 return ret;
4619
4620         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
4621         if (ret == -1)
4622                 return -EALREADY;
4623
4624         req_set_fail_links(req);
4625         io_cqring_fill_event(req, -ECANCELED);
4626         io_put_req(req);
4627         return 0;
4628 }
4629
4630 static int io_timeout_remove_prep(struct io_kiocb *req,
4631                                   const struct io_uring_sqe *sqe)
4632 {
4633         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4634                 return -EINVAL;
4635         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4636                 return -EINVAL;
4637
4638         req->timeout.addr = READ_ONCE(sqe->addr);
4639         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4640         if (req->timeout.flags)
4641                 return -EINVAL;
4642
4643         return 0;
4644 }
4645
4646 /*
4647  * Remove or update an existing timeout command
4648  */
4649 static int io_timeout_remove(struct io_kiocb *req)
4650 {
4651         struct io_ring_ctx *ctx = req->ctx;
4652         int ret;
4653
4654         spin_lock_irq(&ctx->completion_lock);
4655         ret = io_timeout_cancel(ctx, req->timeout.addr);
4656
4657         io_cqring_fill_event(req, ret);
4658         io_commit_cqring(ctx);
4659         spin_unlock_irq(&ctx->completion_lock);
4660         io_cqring_ev_posted(ctx);
4661         if (ret < 0)
4662                 req_set_fail_links(req);
4663         io_put_req(req);
4664         return 0;
4665 }
4666
4667 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4668                            bool is_timeout_link)
4669 {
4670         struct io_timeout_data *data;
4671         unsigned flags;
4672
4673         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4674                 return -EINVAL;
4675         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
4676                 return -EINVAL;
4677         if (sqe->off && is_timeout_link)
4678                 return -EINVAL;
4679         flags = READ_ONCE(sqe->timeout_flags);
4680         if (flags & ~IORING_TIMEOUT_ABS)
4681                 return -EINVAL;
4682
4683         req->timeout.count = READ_ONCE(sqe->off);
4684
4685         if (!req->io && io_alloc_async_ctx(req))
4686                 return -ENOMEM;
4687
4688         data = &req->io->timeout;
4689         data->req = req;
4690         req->flags |= REQ_F_TIMEOUT;
4691
4692         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
4693                 return -EFAULT;
4694
4695         if (flags & IORING_TIMEOUT_ABS)
4696                 data->mode = HRTIMER_MODE_ABS;
4697         else
4698                 data->mode = HRTIMER_MODE_REL;
4699
4700         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4701         return 0;
4702 }
4703
4704 static int io_timeout(struct io_kiocb *req)
4705 {
4706         struct io_ring_ctx *ctx = req->ctx;
4707         struct io_timeout_data *data;
4708         struct list_head *entry;
4709         unsigned span = 0;
4710         u32 count = req->timeout.count;
4711         u32 seq = req->sequence;
4712
4713         data = &req->io->timeout;
4714
4715         /*
4716          * sqe->off holds how many events that need to occur for this
4717          * timeout event to be satisfied. If it isn't set, then this is
4718          * a pure timeout request, sequence isn't used.
4719          */
4720         if (!count) {
4721                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4722                 spin_lock_irq(&ctx->completion_lock);
4723                 entry = ctx->timeout_list.prev;
4724                 goto add;
4725         }
4726
4727         req->sequence = seq + count;
4728
4729         /*
4730          * Insertion sort, ensuring the first entry in the list is always
4731          * the one we need first.
4732          */
4733         spin_lock_irq(&ctx->completion_lock);
4734         list_for_each_prev(entry, &ctx->timeout_list) {
4735                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
4736                 unsigned nxt_seq;
4737                 long long tmp, tmp_nxt;
4738                 u32 nxt_offset = nxt->timeout.count;
4739
4740                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4741                         continue;
4742
4743                 /*
4744                  * Since seq + count can overflow, use type long
4745                  * long to store it.
4746                  */
4747                 tmp = (long long)seq + count;
4748                 nxt_seq = nxt->sequence - nxt_offset;
4749                 tmp_nxt = (long long)nxt_seq + nxt_offset;
4750
4751                 /*
4752                  * cached_sq_head may overflow, and it will never overflow twice
4753                  * once there is some timeout req still be valid.
4754                  */
4755                 if (seq < nxt_seq)
4756                         tmp += UINT_MAX;
4757
4758                 if (tmp > tmp_nxt)
4759                         break;
4760
4761                 /*
4762                  * Sequence of reqs after the insert one and itself should
4763                  * be adjusted because each timeout req consumes a slot.
4764                  */
4765                 span++;
4766                 nxt->sequence++;
4767         }
4768         req->sequence -= span;
4769 add:
4770         list_add(&req->list, entry);
4771         data->timer.function = io_timeout_fn;
4772         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
4773         spin_unlock_irq(&ctx->completion_lock);
4774         return 0;
4775 }
4776
4777 static bool io_cancel_cb(struct io_wq_work *work, void *data)
4778 {
4779         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4780
4781         return req->user_data == (unsigned long) data;
4782 }
4783
4784 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
4785 {
4786         enum io_wq_cancel cancel_ret;
4787         int ret = 0;
4788
4789         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
4790         switch (cancel_ret) {
4791         case IO_WQ_CANCEL_OK:
4792                 ret = 0;
4793                 break;
4794         case IO_WQ_CANCEL_RUNNING:
4795                 ret = -EALREADY;
4796                 break;
4797         case IO_WQ_CANCEL_NOTFOUND:
4798                 ret = -ENOENT;
4799                 break;
4800         }
4801
4802         return ret;
4803 }
4804
4805 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4806                                      struct io_kiocb *req, __u64 sqe_addr,
4807                                      int success_ret)
4808 {
4809         unsigned long flags;
4810         int ret;
4811
4812         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4813         if (ret != -ENOENT) {
4814                 spin_lock_irqsave(&ctx->completion_lock, flags);
4815                 goto done;
4816         }
4817
4818         spin_lock_irqsave(&ctx->completion_lock, flags);
4819         ret = io_timeout_cancel(ctx, sqe_addr);
4820         if (ret != -ENOENT)
4821                 goto done;
4822         ret = io_poll_cancel(ctx, sqe_addr);
4823 done:
4824         if (!ret)
4825                 ret = success_ret;
4826         io_cqring_fill_event(req, ret);
4827         io_commit_cqring(ctx);
4828         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4829         io_cqring_ev_posted(ctx);
4830
4831         if (ret < 0)
4832                 req_set_fail_links(req);
4833         io_put_req(req);
4834 }
4835
4836 static int io_async_cancel_prep(struct io_kiocb *req,
4837                                 const struct io_uring_sqe *sqe)
4838 {
4839         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4840                 return -EINVAL;
4841         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4842             sqe->cancel_flags)
4843                 return -EINVAL;
4844
4845         req->cancel.addr = READ_ONCE(sqe->addr);
4846         return 0;
4847 }
4848
4849 static int io_async_cancel(struct io_kiocb *req)
4850 {
4851         struct io_ring_ctx *ctx = req->ctx;
4852
4853         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
4854         return 0;
4855 }
4856
4857 static int io_files_update_prep(struct io_kiocb *req,
4858                                 const struct io_uring_sqe *sqe)
4859 {
4860         if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4861                 return -EINVAL;
4862
4863         req->files_update.offset = READ_ONCE(sqe->off);
4864         req->files_update.nr_args = READ_ONCE(sqe->len);
4865         if (!req->files_update.nr_args)
4866                 return -EINVAL;
4867         req->files_update.arg = READ_ONCE(sqe->addr);
4868         return 0;
4869 }
4870
4871 static int io_files_update(struct io_kiocb *req, bool force_nonblock)
4872 {
4873         struct io_ring_ctx *ctx = req->ctx;
4874         struct io_uring_files_update up;
4875         int ret;
4876
4877         if (force_nonblock)
4878                 return -EAGAIN;
4879
4880         up.offset = req->files_update.offset;
4881         up.fds = req->files_update.arg;
4882
4883         mutex_lock(&ctx->uring_lock);
4884         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4885         mutex_unlock(&ctx->uring_lock);
4886
4887         if (ret < 0)
4888                 req_set_fail_links(req);
4889         io_cqring_add_event(req, ret);
4890         io_put_req(req);
4891         return 0;
4892 }
4893
4894 static int io_req_defer_prep(struct io_kiocb *req,
4895                              const struct io_uring_sqe *sqe)
4896 {
4897         ssize_t ret = 0;
4898
4899         if (!sqe)
4900                 return 0;
4901
4902         if (io_op_defs[req->opcode].file_table) {
4903                 ret = io_grab_files(req);
4904                 if (unlikely(ret))
4905                         return ret;
4906         }
4907
4908         io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4909
4910         switch (req->opcode) {
4911         case IORING_OP_NOP:
4912                 break;
4913         case IORING_OP_READV:
4914         case IORING_OP_READ_FIXED:
4915         case IORING_OP_READ:
4916                 ret = io_read_prep(req, sqe, true);
4917                 break;
4918         case IORING_OP_WRITEV:
4919         case IORING_OP_WRITE_FIXED:
4920         case IORING_OP_WRITE:
4921                 ret = io_write_prep(req, sqe, true);
4922                 break;
4923         case IORING_OP_POLL_ADD:
4924                 ret = io_poll_add_prep(req, sqe);
4925                 break;
4926         case IORING_OP_POLL_REMOVE:
4927                 ret = io_poll_remove_prep(req, sqe);
4928                 break;
4929         case IORING_OP_FSYNC:
4930                 ret = io_prep_fsync(req, sqe);
4931                 break;
4932         case IORING_OP_SYNC_FILE_RANGE:
4933                 ret = io_prep_sfr(req, sqe);
4934                 break;
4935         case IORING_OP_SENDMSG:
4936         case IORING_OP_SEND:
4937                 ret = io_sendmsg_prep(req, sqe);
4938                 break;
4939         case IORING_OP_RECVMSG:
4940         case IORING_OP_RECV:
4941                 ret = io_recvmsg_prep(req, sqe);
4942                 break;
4943         case IORING_OP_CONNECT:
4944                 ret = io_connect_prep(req, sqe);
4945                 break;
4946         case IORING_OP_TIMEOUT:
4947                 ret = io_timeout_prep(req, sqe, false);
4948                 break;
4949         case IORING_OP_TIMEOUT_REMOVE:
4950                 ret = io_timeout_remove_prep(req, sqe);
4951                 break;
4952         case IORING_OP_ASYNC_CANCEL:
4953                 ret = io_async_cancel_prep(req, sqe);
4954                 break;
4955         case IORING_OP_LINK_TIMEOUT:
4956                 ret = io_timeout_prep(req, sqe, true);
4957                 break;
4958         case IORING_OP_ACCEPT:
4959                 ret = io_accept_prep(req, sqe);
4960                 break;
4961         case IORING_OP_FALLOCATE:
4962                 ret = io_fallocate_prep(req, sqe);
4963                 break;
4964         case IORING_OP_OPENAT:
4965                 ret = io_openat_prep(req, sqe);
4966                 break;
4967         case IORING_OP_CLOSE:
4968                 ret = io_close_prep(req, sqe);
4969                 break;
4970         case IORING_OP_FILES_UPDATE:
4971                 ret = io_files_update_prep(req, sqe);
4972                 break;
4973         case IORING_OP_STATX:
4974                 ret = io_statx_prep(req, sqe);
4975                 break;
4976         case IORING_OP_FADVISE:
4977                 ret = io_fadvise_prep(req, sqe);
4978                 break;
4979         case IORING_OP_MADVISE:
4980                 ret = io_madvise_prep(req, sqe);
4981                 break;
4982         case IORING_OP_OPENAT2:
4983                 ret = io_openat2_prep(req, sqe);
4984                 break;
4985         case IORING_OP_EPOLL_CTL:
4986                 ret = io_epoll_ctl_prep(req, sqe);
4987                 break;
4988         case IORING_OP_SPLICE:
4989                 ret = io_splice_prep(req, sqe);
4990                 break;
4991         case IORING_OP_PROVIDE_BUFFERS:
4992                 ret = io_provide_buffers_prep(req, sqe);
4993                 break;
4994         case IORING_OP_REMOVE_BUFFERS:
4995                 ret = io_remove_buffers_prep(req, sqe);
4996                 break;
4997         default:
4998                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
4999                                 req->opcode);
5000                 ret = -EINVAL;
5001                 break;
5002         }
5003
5004         return ret;
5005 }
5006
5007 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5008 {
5009         struct io_ring_ctx *ctx = req->ctx;
5010         int ret;
5011
5012         /* Still need defer if there is pending req in defer list. */
5013         if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
5014                 return 0;
5015
5016         if (!req->io) {
5017                 if (io_alloc_async_ctx(req))
5018                         return -EAGAIN;
5019                 ret = io_req_defer_prep(req, sqe);
5020                 if (ret < 0)
5021                         return ret;
5022         }
5023
5024         spin_lock_irq(&ctx->completion_lock);
5025         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
5026                 spin_unlock_irq(&ctx->completion_lock);
5027                 return 0;
5028         }
5029
5030         trace_io_uring_defer(ctx, req, req->user_data);
5031         list_add_tail(&req->list, &ctx->defer_list);
5032         spin_unlock_irq(&ctx->completion_lock);
5033         return -EIOCBQUEUED;
5034 }
5035
5036 static void io_cleanup_req(struct io_kiocb *req)
5037 {
5038         struct io_async_ctx *io = req->io;
5039
5040         switch (req->opcode) {
5041         case IORING_OP_READV:
5042         case IORING_OP_READ_FIXED:
5043         case IORING_OP_READ:
5044                 if (req->flags & REQ_F_BUFFER_SELECTED)
5045                         kfree((void *)(unsigned long)req->rw.addr);
5046                 /* fallthrough */
5047         case IORING_OP_WRITEV:
5048         case IORING_OP_WRITE_FIXED:
5049         case IORING_OP_WRITE:
5050                 if (io->rw.iov != io->rw.fast_iov)
5051                         kfree(io->rw.iov);
5052                 break;
5053         case IORING_OP_RECVMSG:
5054                 if (req->flags & REQ_F_BUFFER_SELECTED)
5055                         kfree(req->sr_msg.kbuf);
5056                 /* fallthrough */
5057         case IORING_OP_SENDMSG:
5058                 if (io->msg.iov != io->msg.fast_iov)
5059                         kfree(io->msg.iov);
5060                 break;
5061         case IORING_OP_RECV:
5062                 if (req->flags & REQ_F_BUFFER_SELECTED)
5063                         kfree(req->sr_msg.kbuf);
5064                 break;
5065         case IORING_OP_OPENAT:
5066         case IORING_OP_OPENAT2:
5067         case IORING_OP_STATX:
5068                 putname(req->open.filename);
5069                 break;
5070         case IORING_OP_SPLICE:
5071                 io_put_file(req, req->splice.file_in,
5072                             (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5073                 break;
5074         }
5075
5076         req->flags &= ~REQ_F_NEED_CLEANUP;
5077 }
5078
5079 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5080                         bool force_nonblock)
5081 {
5082         struct io_ring_ctx *ctx = req->ctx;
5083         int ret;
5084
5085         switch (req->opcode) {
5086         case IORING_OP_NOP:
5087                 ret = io_nop(req);
5088                 break;
5089         case IORING_OP_READV:
5090         case IORING_OP_READ_FIXED:
5091         case IORING_OP_READ:
5092                 if (sqe) {
5093                         ret = io_read_prep(req, sqe, force_nonblock);
5094                         if (ret < 0)
5095                                 break;
5096                 }
5097                 ret = io_read(req, force_nonblock);
5098                 break;
5099         case IORING_OP_WRITEV:
5100         case IORING_OP_WRITE_FIXED:
5101         case IORING_OP_WRITE:
5102                 if (sqe) {
5103                         ret = io_write_prep(req, sqe, force_nonblock);
5104                         if (ret < 0)
5105                                 break;
5106                 }
5107                 ret = io_write(req, force_nonblock);
5108                 break;
5109         case IORING_OP_FSYNC:
5110                 if (sqe) {
5111                         ret = io_prep_fsync(req, sqe);
5112                         if (ret < 0)
5113                                 break;
5114                 }
5115                 ret = io_fsync(req, force_nonblock);
5116                 break;
5117         case IORING_OP_POLL_ADD:
5118                 if (sqe) {
5119                         ret = io_poll_add_prep(req, sqe);
5120                         if (ret)
5121                                 break;
5122                 }
5123                 ret = io_poll_add(req);
5124                 break;
5125         case IORING_OP_POLL_REMOVE:
5126                 if (sqe) {
5127                         ret = io_poll_remove_prep(req, sqe);
5128                         if (ret < 0)
5129                                 break;
5130                 }
5131                 ret = io_poll_remove(req);
5132                 break;
5133         case IORING_OP_SYNC_FILE_RANGE:
5134                 if (sqe) {
5135                         ret = io_prep_sfr(req, sqe);
5136                         if (ret < 0)
5137                                 break;
5138                 }
5139                 ret = io_sync_file_range(req, force_nonblock);
5140                 break;
5141         case IORING_OP_SENDMSG:
5142         case IORING_OP_SEND:
5143                 if (sqe) {
5144                         ret = io_sendmsg_prep(req, sqe);
5145                         if (ret < 0)
5146                                 break;
5147                 }
5148                 if (req->opcode == IORING_OP_SENDMSG)
5149                         ret = io_sendmsg(req, force_nonblock);
5150                 else
5151                         ret = io_send(req, force_nonblock);
5152                 break;
5153         case IORING_OP_RECVMSG:
5154         case IORING_OP_RECV:
5155                 if (sqe) {
5156                         ret = io_recvmsg_prep(req, sqe);
5157                         if (ret)
5158                                 break;
5159                 }
5160                 if (req->opcode == IORING_OP_RECVMSG)
5161                         ret = io_recvmsg(req, force_nonblock);
5162                 else
5163                         ret = io_recv(req, force_nonblock);
5164                 break;
5165         case IORING_OP_TIMEOUT:
5166                 if (sqe) {
5167                         ret = io_timeout_prep(req, sqe, false);
5168                         if (ret)
5169                                 break;
5170                 }
5171                 ret = io_timeout(req);
5172                 break;
5173         case IORING_OP_TIMEOUT_REMOVE:
5174                 if (sqe) {
5175                         ret = io_timeout_remove_prep(req, sqe);
5176                         if (ret)
5177                                 break;
5178                 }
5179                 ret = io_timeout_remove(req);
5180                 break;
5181         case IORING_OP_ACCEPT:
5182                 if (sqe) {
5183                         ret = io_accept_prep(req, sqe);
5184                         if (ret)
5185                                 break;
5186                 }
5187                 ret = io_accept(req, force_nonblock);
5188                 break;
5189         case IORING_OP_CONNECT:
5190                 if (sqe) {
5191                         ret = io_connect_prep(req, sqe);
5192                         if (ret)
5193                                 break;
5194                 }
5195                 ret = io_connect(req, force_nonblock);
5196                 break;
5197         case IORING_OP_ASYNC_CANCEL:
5198                 if (sqe) {
5199                         ret = io_async_cancel_prep(req, sqe);
5200                         if (ret)
5201                                 break;
5202                 }
5203                 ret = io_async_cancel(req);
5204                 break;
5205         case IORING_OP_FALLOCATE:
5206                 if (sqe) {
5207                         ret = io_fallocate_prep(req, sqe);
5208                         if (ret)
5209                                 break;
5210                 }
5211                 ret = io_fallocate(req, force_nonblock);
5212                 break;
5213         case IORING_OP_OPENAT:
5214                 if (sqe) {
5215                         ret = io_openat_prep(req, sqe);
5216                         if (ret)
5217                                 break;
5218                 }
5219                 ret = io_openat(req, force_nonblock);
5220                 break;
5221         case IORING_OP_CLOSE:
5222                 if (sqe) {
5223                         ret = io_close_prep(req, sqe);
5224                         if (ret)
5225                                 break;
5226                 }
5227                 ret = io_close(req, force_nonblock);
5228                 break;
5229         case IORING_OP_FILES_UPDATE:
5230                 if (sqe) {
5231                         ret = io_files_update_prep(req, sqe);
5232                         if (ret)
5233                                 break;
5234                 }
5235                 ret = io_files_update(req, force_nonblock);
5236                 break;
5237         case IORING_OP_STATX:
5238                 if (sqe) {
5239                         ret = io_statx_prep(req, sqe);
5240                         if (ret)
5241                                 break;
5242                 }
5243                 ret = io_statx(req, force_nonblock);
5244                 break;
5245         case IORING_OP_FADVISE:
5246                 if (sqe) {
5247                         ret = io_fadvise_prep(req, sqe);
5248                         if (ret)
5249                                 break;
5250                 }
5251                 ret = io_fadvise(req, force_nonblock);
5252                 break;
5253         case IORING_OP_MADVISE:
5254                 if (sqe) {
5255                         ret = io_madvise_prep(req, sqe);
5256                         if (ret)
5257                                 break;
5258                 }
5259                 ret = io_madvise(req, force_nonblock);
5260                 break;
5261         case IORING_OP_OPENAT2:
5262                 if (sqe) {
5263                         ret = io_openat2_prep(req, sqe);
5264                         if (ret)
5265                                 break;
5266                 }
5267                 ret = io_openat2(req, force_nonblock);
5268                 break;
5269         case IORING_OP_EPOLL_CTL:
5270                 if (sqe) {
5271                         ret = io_epoll_ctl_prep(req, sqe);
5272                         if (ret)
5273                                 break;
5274                 }
5275                 ret = io_epoll_ctl(req, force_nonblock);
5276                 break;
5277         case IORING_OP_SPLICE:
5278                 if (sqe) {
5279                         ret = io_splice_prep(req, sqe);
5280                         if (ret < 0)
5281                                 break;
5282                 }
5283                 ret = io_splice(req, force_nonblock);
5284                 break;
5285         case IORING_OP_PROVIDE_BUFFERS:
5286                 if (sqe) {
5287                         ret = io_provide_buffers_prep(req, sqe);
5288                         if (ret)
5289                                 break;
5290                 }
5291                 ret = io_provide_buffers(req, force_nonblock);
5292                 break;
5293         case IORING_OP_REMOVE_BUFFERS:
5294                 if (sqe) {
5295                         ret = io_remove_buffers_prep(req, sqe);
5296                         if (ret)
5297                                 break;
5298                 }
5299                 ret = io_remove_buffers(req, force_nonblock);
5300                 break;
5301         default:
5302                 ret = -EINVAL;
5303                 break;
5304         }
5305
5306         if (ret)
5307                 return ret;
5308
5309         /* If the op doesn't have a file, we're not polling for it */
5310         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
5311                 const bool in_async = io_wq_current_is_worker();
5312
5313                 if (req->result == -EAGAIN)
5314                         return -EAGAIN;
5315
5316                 /* workqueue context doesn't hold uring_lock, grab it now */
5317                 if (in_async)
5318                         mutex_lock(&ctx->uring_lock);
5319
5320                 io_iopoll_req_issued(req);
5321
5322                 if (in_async)
5323                         mutex_unlock(&ctx->uring_lock);
5324         }
5325
5326         return 0;
5327 }
5328
5329 static void io_wq_submit_work(struct io_wq_work **workptr)
5330 {
5331         struct io_wq_work *work = *workptr;
5332         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5333         int ret = 0;
5334
5335         /* if NO_CANCEL is set, we must still run the work */
5336         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5337                                 IO_WQ_WORK_CANCEL) {
5338                 ret = -ECANCELED;
5339         }
5340
5341         if (!ret) {
5342                 do {
5343                         ret = io_issue_sqe(req, NULL, false);
5344                         /*
5345                          * We can get EAGAIN for polled IO even though we're
5346                          * forcing a sync submission from here, since we can't
5347                          * wait for request slots on the block side.
5348                          */
5349                         if (ret != -EAGAIN)
5350                                 break;
5351                         cond_resched();
5352                 } while (1);
5353         }
5354
5355         if (ret) {
5356                 req_set_fail_links(req);
5357                 io_cqring_add_event(req, ret);
5358                 io_put_req(req);
5359         }
5360
5361         io_steal_work(req, workptr);
5362 }
5363
5364 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5365                                               int index)
5366 {
5367         struct fixed_file_table *table;
5368
5369         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5370         return table->files[index & IORING_FILE_TABLE_MASK];;
5371 }
5372
5373 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5374                         int fd, struct file **out_file, bool fixed)
5375 {
5376         struct io_ring_ctx *ctx = req->ctx;
5377         struct file *file;
5378
5379         if (fixed) {
5380                 if (unlikely(!ctx->file_data ||
5381                     (unsigned) fd >= ctx->nr_user_files))
5382                         return -EBADF;
5383                 fd = array_index_nospec(fd, ctx->nr_user_files);
5384                 file = io_file_from_index(ctx, fd);
5385                 if (!file)
5386                         return -EBADF;
5387                 req->fixed_file_refs = ctx->file_data->cur_refs;
5388                 percpu_ref_get(req->fixed_file_refs);
5389         } else {
5390                 trace_io_uring_file_get(ctx, fd);
5391                 file = __io_file_get(state, fd);
5392                 if (unlikely(!file))
5393                         return -EBADF;
5394         }
5395
5396         *out_file = file;
5397         return 0;
5398 }
5399
5400 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5401                            int fd)
5402 {
5403         bool fixed;
5404
5405         fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
5406         if (unlikely(!fixed && req->needs_fixed_file))
5407                 return -EBADF;
5408
5409         return io_file_get(state, req, fd, &req->file, fixed);
5410 }
5411
5412 static int io_grab_files(struct io_kiocb *req)
5413 {
5414         int ret = -EBADF;
5415         struct io_ring_ctx *ctx = req->ctx;
5416
5417         if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
5418                 return 0;
5419         if (!ctx->ring_file)
5420                 return -EBADF;
5421
5422         rcu_read_lock();
5423         spin_lock_irq(&ctx->inflight_lock);
5424         /*
5425          * We use the f_ops->flush() handler to ensure that we can flush
5426          * out work accessing these files if the fd is closed. Check if
5427          * the fd has changed since we started down this path, and disallow
5428          * this operation if it has.
5429          */
5430         if (fcheck(ctx->ring_fd) == ctx->ring_file) {
5431                 list_add(&req->inflight_entry, &ctx->inflight_list);
5432                 req->flags |= REQ_F_INFLIGHT;
5433                 req->work.files = current->files;
5434                 ret = 0;
5435         }
5436         spin_unlock_irq(&ctx->inflight_lock);
5437         rcu_read_unlock();
5438
5439         return ret;
5440 }
5441
5442 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
5443 {
5444         struct io_timeout_data *data = container_of(timer,
5445                                                 struct io_timeout_data, timer);
5446         struct io_kiocb *req = data->req;
5447         struct io_ring_ctx *ctx = req->ctx;
5448         struct io_kiocb *prev = NULL;
5449         unsigned long flags;
5450
5451         spin_lock_irqsave(&ctx->completion_lock, flags);
5452
5453         /*
5454          * We don't expect the list to be empty, that will only happen if we
5455          * race with the completion of the linked work.
5456          */
5457         if (!list_empty(&req->link_list)) {
5458                 prev = list_entry(req->link_list.prev, struct io_kiocb,
5459                                   link_list);
5460                 if (refcount_inc_not_zero(&prev->refs)) {
5461                         list_del_init(&req->link_list);
5462                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
5463                 } else
5464                         prev = NULL;
5465         }
5466
5467         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5468
5469         if (prev) {
5470                 req_set_fail_links(prev);
5471                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
5472                 io_put_req(prev);
5473         } else {
5474                 io_cqring_add_event(req, -ETIME);
5475                 io_put_req(req);
5476         }
5477         return HRTIMER_NORESTART;
5478 }
5479
5480 static void io_queue_linked_timeout(struct io_kiocb *req)
5481 {
5482         struct io_ring_ctx *ctx = req->ctx;
5483
5484         /*
5485          * If the list is now empty, then our linked request finished before
5486          * we got a chance to setup the timer
5487          */
5488         spin_lock_irq(&ctx->completion_lock);
5489         if (!list_empty(&req->link_list)) {
5490                 struct io_timeout_data *data = &req->io->timeout;
5491
5492                 data->timer.function = io_link_timeout_fn;
5493                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5494                                 data->mode);
5495         }
5496         spin_unlock_irq(&ctx->completion_lock);
5497
5498         /* drop submission reference */
5499         io_put_req(req);
5500 }
5501
5502 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
5503 {
5504         struct io_kiocb *nxt;
5505
5506         if (!(req->flags & REQ_F_LINK_HEAD))
5507                 return NULL;
5508         /* for polled retry, if flag is set, we already went through here */
5509         if (req->flags & REQ_F_POLLED)
5510                 return NULL;
5511
5512         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5513                                         link_list);
5514         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
5515                 return NULL;
5516
5517         req->flags |= REQ_F_LINK_TIMEOUT;
5518         return nxt;
5519 }
5520
5521 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5522 {
5523         struct io_kiocb *linked_timeout;
5524         struct io_kiocb *nxt;
5525         const struct cred *old_creds = NULL;
5526         int ret;
5527
5528 again:
5529         linked_timeout = io_prep_linked_timeout(req);
5530
5531         if (req->work.creds && req->work.creds != current_cred()) {
5532                 if (old_creds)
5533                         revert_creds(old_creds);
5534                 if (old_creds == req->work.creds)
5535                         old_creds = NULL; /* restored original creds */
5536                 else
5537                         old_creds = override_creds(req->work.creds);
5538         }
5539
5540         ret = io_issue_sqe(req, sqe, true);
5541
5542         /*
5543          * We async punt it if the file wasn't marked NOWAIT, or if the file
5544          * doesn't support non-blocking read/write attempts
5545          */
5546         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5547             (req->flags & REQ_F_MUST_PUNT))) {
5548                 if (io_arm_poll_handler(req)) {
5549                         if (linked_timeout)
5550                                 io_queue_linked_timeout(linked_timeout);
5551                         goto exit;
5552                 }
5553 punt:
5554                 if (io_op_defs[req->opcode].file_table) {
5555                         ret = io_grab_files(req);
5556                         if (ret)
5557                                 goto err;
5558                 }
5559
5560                 /*
5561                  * Queued up for async execution, worker will release
5562                  * submit reference when the iocb is actually submitted.
5563                  */
5564                 io_queue_async_work(req);
5565                 goto exit;
5566         }
5567
5568 err:
5569         nxt = NULL;
5570         /* drop submission reference */
5571         io_put_req_find_next(req, &nxt);
5572
5573         if (linked_timeout) {
5574                 if (!ret)
5575                         io_queue_linked_timeout(linked_timeout);
5576                 else
5577                         io_put_req(linked_timeout);
5578         }
5579
5580         /* and drop final reference, if we failed */
5581         if (ret) {
5582                 io_cqring_add_event(req, ret);
5583                 req_set_fail_links(req);
5584                 io_put_req(req);
5585         }
5586         if (nxt) {
5587                 req = nxt;
5588
5589                 if (req->flags & REQ_F_FORCE_ASYNC)
5590                         goto punt;
5591                 goto again;
5592         }
5593 exit:
5594         if (old_creds)
5595                 revert_creds(old_creds);
5596 }
5597
5598 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5599 {
5600         int ret;
5601
5602         ret = io_req_defer(req, sqe);
5603         if (ret) {
5604                 if (ret != -EIOCBQUEUED) {
5605 fail_req:
5606                         io_cqring_add_event(req, ret);
5607                         req_set_fail_links(req);
5608                         io_double_put_req(req);
5609                 }
5610         } else if (req->flags & REQ_F_FORCE_ASYNC) {
5611                 if (!req->io) {
5612                         ret = -EAGAIN;
5613                         if (io_alloc_async_ctx(req))
5614                                 goto fail_req;
5615                         ret = io_req_defer_prep(req, sqe);
5616                         if (unlikely(ret < 0))
5617                                 goto fail_req;
5618                 }
5619
5620                 /*
5621                  * Never try inline submit of IOSQE_ASYNC is set, go straight
5622                  * to async execution.
5623                  */
5624                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5625                 io_queue_async_work(req);
5626         } else {
5627                 __io_queue_sqe(req, sqe);
5628         }
5629 }
5630
5631 static inline void io_queue_link_head(struct io_kiocb *req)
5632 {
5633         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
5634                 io_cqring_add_event(req, -ECANCELED);
5635                 io_double_put_req(req);
5636         } else
5637                 io_queue_sqe(req, NULL);
5638 }
5639
5640 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5641                           struct io_submit_state *state, struct io_kiocb **link)
5642 {
5643         struct io_ring_ctx *ctx = req->ctx;
5644         int ret;
5645
5646         /*
5647          * If we already have a head request, queue this one for async
5648          * submittal once the head completes. If we don't have a head but
5649          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5650          * submitted sync once the chain is complete. If none of those
5651          * conditions are true (normal request), then just queue it.
5652          */
5653         if (*link) {
5654                 struct io_kiocb *head = *link;
5655
5656                 /*
5657                  * Taking sequential execution of a link, draining both sides
5658                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
5659                  * requests in the link. So, it drains the head and the
5660                  * next after the link request. The last one is done via
5661                  * drain_next flag to persist the effect across calls.
5662                  */
5663                 if (req->flags & REQ_F_IO_DRAIN) {
5664                         head->flags |= REQ_F_IO_DRAIN;
5665                         ctx->drain_next = 1;
5666                 }
5667                 if (io_alloc_async_ctx(req))
5668                         return -EAGAIN;
5669
5670                 ret = io_req_defer_prep(req, sqe);
5671                 if (ret) {
5672                         /* fail even hard links since we don't submit */
5673                         head->flags |= REQ_F_FAIL_LINK;
5674                         return ret;
5675                 }
5676                 trace_io_uring_link(ctx, req, head);
5677                 list_add_tail(&req->link_list, &head->link_list);
5678
5679                 /* last request of a link, enqueue the link */
5680                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
5681                         io_queue_link_head(head);
5682                         *link = NULL;
5683                 }
5684         } else {
5685                 if (unlikely(ctx->drain_next)) {
5686                         req->flags |= REQ_F_IO_DRAIN;
5687                         ctx->drain_next = 0;
5688                 }
5689                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
5690                         req->flags |= REQ_F_LINK_HEAD;
5691                         INIT_LIST_HEAD(&req->link_list);
5692
5693                         if (io_alloc_async_ctx(req))
5694                                 return -EAGAIN;
5695
5696                         ret = io_req_defer_prep(req, sqe);
5697                         if (ret)
5698                                 req->flags |= REQ_F_FAIL_LINK;
5699                         *link = req;
5700                 } else {
5701                         io_queue_sqe(req, sqe);
5702                 }
5703         }
5704
5705         return 0;
5706 }
5707
5708 /*
5709  * Batched submission is done, ensure local IO is flushed out.
5710  */
5711 static void io_submit_state_end(struct io_submit_state *state)
5712 {
5713         blk_finish_plug(&state->plug);
5714         io_file_put(state);
5715         if (state->free_reqs)
5716                 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
5717 }
5718
5719 /*
5720  * Start submission side cache.
5721  */
5722 static void io_submit_state_start(struct io_submit_state *state,
5723                                   unsigned int max_ios)
5724 {
5725         blk_start_plug(&state->plug);
5726         state->free_reqs = 0;
5727         state->file = NULL;
5728         state->ios_left = max_ios;
5729 }
5730
5731 static void io_commit_sqring(struct io_ring_ctx *ctx)
5732 {
5733         struct io_rings *rings = ctx->rings;
5734
5735         /*
5736          * Ensure any loads from the SQEs are done at this point,
5737          * since once we write the new head, the application could
5738          * write new data to them.
5739          */
5740         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
5741 }
5742
5743 /*
5744  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
5745  * that is mapped by userspace. This means that care needs to be taken to
5746  * ensure that reads are stable, as we cannot rely on userspace always
5747  * being a good citizen. If members of the sqe are validated and then later
5748  * used, it's important that those reads are done through READ_ONCE() to
5749  * prevent a re-load down the line.
5750  */
5751 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
5752 {
5753         u32 *sq_array = ctx->sq_array;
5754         unsigned head;
5755
5756         /*
5757          * The cached sq head (or cq tail) serves two purposes:
5758          *
5759          * 1) allows us to batch the cost of updating the user visible
5760          *    head updates.
5761          * 2) allows the kernel side to track the head on its own, even
5762          *    though the application is the one updating it.
5763          */
5764         head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
5765         if (likely(head < ctx->sq_entries))
5766                 return &ctx->sq_sqes[head];
5767
5768         /* drop invalid entries */
5769         ctx->cached_sq_dropped++;
5770         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
5771         return NULL;
5772 }
5773
5774 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
5775 {
5776         ctx->cached_sq_head++;
5777 }
5778
5779 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
5780                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
5781                                 IOSQE_BUFFER_SELECT)
5782
5783 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
5784                        const struct io_uring_sqe *sqe,
5785                        struct io_submit_state *state, bool async)
5786 {
5787         unsigned int sqe_flags;
5788         int id;
5789
5790         /*
5791          * All io need record the previous position, if LINK vs DARIN,
5792          * it can be used to mark the position of the first IO in the
5793          * link list.
5794          */
5795         req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
5796         req->opcode = READ_ONCE(sqe->opcode);
5797         req->user_data = READ_ONCE(sqe->user_data);
5798         req->io = NULL;
5799         req->file = NULL;
5800         req->ctx = ctx;
5801         req->flags = 0;
5802         /* one is dropped after submission, the other at completion */
5803         refcount_set(&req->refs, 2);
5804         req->task = NULL;
5805         req->result = 0;
5806         req->needs_fixed_file = async;
5807         INIT_IO_WORK(&req->work, io_wq_submit_work);
5808
5809         if (unlikely(req->opcode >= IORING_OP_LAST))
5810                 return -EINVAL;
5811
5812         if (io_op_defs[req->opcode].needs_mm && !current->mm) {
5813                 if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
5814                         return -EFAULT;
5815                 use_mm(ctx->sqo_mm);
5816         }
5817
5818         sqe_flags = READ_ONCE(sqe->flags);
5819         /* enforce forwards compatibility on users */
5820         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
5821                 return -EINVAL;
5822
5823         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
5824             !io_op_defs[req->opcode].buffer_select)
5825                 return -EOPNOTSUPP;
5826
5827         id = READ_ONCE(sqe->personality);
5828         if (id) {
5829                 req->work.creds = idr_find(&ctx->personality_idr, id);
5830                 if (unlikely(!req->work.creds))
5831                         return -EINVAL;
5832                 get_cred(req->work.creds);
5833         }
5834
5835         /* same numerical values with corresponding REQ_F_*, safe to copy */
5836         req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
5837                                         IOSQE_ASYNC | IOSQE_FIXED_FILE |
5838                                         IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
5839
5840         if (!io_op_defs[req->opcode].needs_file)
5841                 return 0;
5842
5843         return io_req_set_file(state, req, READ_ONCE(sqe->fd));
5844 }
5845
5846 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
5847                           struct file *ring_file, int ring_fd, bool async)
5848 {
5849         struct io_submit_state state, *statep = NULL;
5850         struct io_kiocb *link = NULL;
5851         int i, submitted = 0;
5852
5853         /* if we have a backlog and couldn't flush it all, return BUSY */
5854         if (test_bit(0, &ctx->sq_check_overflow)) {
5855                 if (!list_empty(&ctx->cq_overflow_list) &&
5856                     !io_cqring_overflow_flush(ctx, false))
5857                         return -EBUSY;
5858         }
5859
5860         /* make sure SQ entry isn't read before tail */
5861         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
5862
5863         if (!percpu_ref_tryget_many(&ctx->refs, nr))
5864                 return -EAGAIN;
5865
5866         if (nr > IO_PLUG_THRESHOLD) {
5867                 io_submit_state_start(&state, nr);
5868                 statep = &state;
5869         }
5870
5871         ctx->ring_fd = ring_fd;
5872         ctx->ring_file = ring_file;
5873
5874         for (i = 0; i < nr; i++) {
5875                 const struct io_uring_sqe *sqe;
5876                 struct io_kiocb *req;
5877                 int err;
5878
5879                 sqe = io_get_sqe(ctx);
5880                 if (unlikely(!sqe)) {
5881                         io_consume_sqe(ctx);
5882                         break;
5883                 }
5884                 req = io_alloc_req(ctx, statep);
5885                 if (unlikely(!req)) {
5886                         if (!submitted)
5887                                 submitted = -EAGAIN;
5888                         break;
5889                 }
5890
5891                 err = io_init_req(ctx, req, sqe, statep, async);
5892                 io_consume_sqe(ctx);
5893                 /* will complete beyond this point, count as submitted */
5894                 submitted++;
5895
5896                 if (unlikely(err)) {
5897 fail_req:
5898                         io_cqring_add_event(req, err);
5899                         io_double_put_req(req);
5900                         break;
5901                 }
5902
5903                 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5904                                                 true, async);
5905                 err = io_submit_sqe(req, sqe, statep, &link);
5906                 if (err)
5907                         goto fail_req;
5908         }
5909
5910         if (unlikely(submitted != nr)) {
5911                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5912
5913                 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5914         }
5915         if (link)
5916                 io_queue_link_head(link);
5917         if (statep)
5918                 io_submit_state_end(&state);
5919
5920          /* Commit SQ ring head once we've consumed and submitted all SQEs */
5921         io_commit_sqring(ctx);
5922
5923         return submitted;
5924 }
5925
5926 static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
5927 {
5928         struct mm_struct *mm = current->mm;
5929
5930         if (mm) {
5931                 unuse_mm(mm);
5932                 mmput(mm);
5933         }
5934 }
5935
5936 static int io_sq_thread(void *data)
5937 {
5938         struct io_ring_ctx *ctx = data;
5939         const struct cred *old_cred;
5940         mm_segment_t old_fs;
5941         DEFINE_WAIT(wait);
5942         unsigned long timeout;
5943         int ret = 0;
5944
5945         complete(&ctx->completions[1]);
5946
5947         old_fs = get_fs();
5948         set_fs(USER_DS);
5949         old_cred = override_creds(ctx->creds);
5950
5951         timeout = jiffies + ctx->sq_thread_idle;
5952         while (!kthread_should_park()) {
5953                 unsigned int to_submit;
5954
5955                 if (!list_empty(&ctx->poll_list)) {
5956                         unsigned nr_events = 0;
5957
5958                         mutex_lock(&ctx->uring_lock);
5959                         if (!list_empty(&ctx->poll_list))
5960                                 io_iopoll_getevents(ctx, &nr_events, 0);
5961                         else
5962                                 timeout = jiffies + ctx->sq_thread_idle;
5963                         mutex_unlock(&ctx->uring_lock);
5964                 }
5965
5966                 to_submit = io_sqring_entries(ctx);
5967
5968                 /*
5969                  * If submit got -EBUSY, flag us as needing the application
5970                  * to enter the kernel to reap and flush events.
5971                  */
5972                 if (!to_submit || ret == -EBUSY) {
5973                         /*
5974                          * Drop cur_mm before scheduling, we can't hold it for
5975                          * long periods (or over schedule()). Do this before
5976                          * adding ourselves to the waitqueue, as the unuse/drop
5977                          * may sleep.
5978                          */
5979                         io_sq_thread_drop_mm(ctx);
5980
5981                         /*
5982                          * We're polling. If we're within the defined idle
5983                          * period, then let us spin without work before going
5984                          * to sleep. The exception is if we got EBUSY doing
5985                          * more IO, we should wait for the application to
5986                          * reap events and wake us up.
5987                          */
5988                         if (!list_empty(&ctx->poll_list) ||
5989                             (!time_after(jiffies, timeout) && ret != -EBUSY &&
5990                             !percpu_ref_is_dying(&ctx->refs))) {
5991                                 if (current->task_works)
5992                                         task_work_run();
5993                                 cond_resched();
5994                                 continue;
5995                         }
5996
5997                         prepare_to_wait(&ctx->sqo_wait, &wait,
5998                                                 TASK_INTERRUPTIBLE);
5999
6000                         /*
6001                          * While doing polled IO, before going to sleep, we need
6002                          * to check if there are new reqs added to poll_list, it
6003                          * is because reqs may have been punted to io worker and
6004                          * will be added to poll_list later, hence check the
6005                          * poll_list again.
6006                          */
6007                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6008                             !list_empty_careful(&ctx->poll_list)) {
6009                                 finish_wait(&ctx->sqo_wait, &wait);
6010                                 continue;
6011                         }
6012
6013                         /* Tell userspace we may need a wakeup call */
6014                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6015                         /* make sure to read SQ tail after writing flags */
6016                         smp_mb();
6017
6018                         to_submit = io_sqring_entries(ctx);
6019                         if (!to_submit || ret == -EBUSY) {
6020                                 if (kthread_should_park()) {
6021                                         finish_wait(&ctx->sqo_wait, &wait);
6022                                         break;
6023                                 }
6024                                 if (current->task_works) {
6025                                         task_work_run();
6026                                         finish_wait(&ctx->sqo_wait, &wait);
6027                                         continue;
6028                                 }
6029                                 if (signal_pending(current))
6030                                         flush_signals(current);
6031                                 schedule();
6032                                 finish_wait(&ctx->sqo_wait, &wait);
6033
6034                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6035                                 ret = 0;
6036                                 continue;
6037                         }
6038                         finish_wait(&ctx->sqo_wait, &wait);
6039
6040                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6041                 }
6042
6043                 mutex_lock(&ctx->uring_lock);
6044                 ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
6045                 mutex_unlock(&ctx->uring_lock);
6046                 timeout = jiffies + ctx->sq_thread_idle;
6047         }
6048
6049         if (current->task_works)
6050                 task_work_run();
6051
6052         set_fs(old_fs);
6053         io_sq_thread_drop_mm(ctx);
6054         revert_creds(old_cred);
6055
6056         kthread_parkme();
6057
6058         return 0;
6059 }
6060
6061 struct io_wait_queue {
6062         struct wait_queue_entry wq;
6063         struct io_ring_ctx *ctx;
6064         unsigned to_wait;
6065         unsigned nr_timeouts;
6066 };
6067
6068 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6069 {
6070         struct io_ring_ctx *ctx = iowq->ctx;
6071
6072         /*
6073          * Wake up if we have enough events, or if a timeout occurred since we
6074          * started waiting. For timeouts, we always want to return to userspace,
6075          * regardless of event count.
6076          */
6077         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6078                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6079 }
6080
6081 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6082                             int wake_flags, void *key)
6083 {
6084         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6085                                                         wq);
6086
6087         /* use noflush == true, as we can't safely rely on locking context */
6088         if (!io_should_wake(iowq, true))
6089                 return -1;
6090
6091         return autoremove_wake_function(curr, mode, wake_flags, key);
6092 }
6093
6094 /*
6095  * Wait until events become available, if we don't already have some. The
6096  * application must reap them itself, as they reside on the shared cq ring.
6097  */
6098 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6099                           const sigset_t __user *sig, size_t sigsz)
6100 {
6101         struct io_wait_queue iowq = {
6102                 .wq = {
6103                         .private        = current,
6104                         .func           = io_wake_function,
6105                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6106                 },
6107                 .ctx            = ctx,
6108                 .to_wait        = min_events,
6109         };
6110         struct io_rings *rings = ctx->rings;
6111         int ret = 0;
6112
6113         do {
6114                 if (io_cqring_events(ctx, false) >= min_events)
6115                         return 0;
6116                 if (!current->task_works)
6117                         break;
6118                 task_work_run();
6119         } while (1);
6120
6121         if (sig) {
6122 #ifdef CONFIG_COMPAT
6123                 if (in_compat_syscall())
6124                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6125                                                       sigsz);
6126                 else
6127 #endif
6128                         ret = set_user_sigmask(sig, sigsz);
6129
6130                 if (ret)
6131                         return ret;
6132         }
6133
6134         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6135         trace_io_uring_cqring_wait(ctx, min_events);
6136         do {
6137                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6138                                                 TASK_INTERRUPTIBLE);
6139                 if (current->task_works)
6140                         task_work_run();
6141                 if (io_should_wake(&iowq, false))
6142                         break;
6143                 schedule();
6144                 if (signal_pending(current)) {
6145                         ret = -EINTR;
6146                         break;
6147                 }
6148         } while (1);
6149         finish_wait(&ctx->wait, &iowq.wq);
6150
6151         restore_saved_sigmask_unless(ret == -EINTR);
6152
6153         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6154 }
6155
6156 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6157 {
6158 #if defined(CONFIG_UNIX)
6159         if (ctx->ring_sock) {
6160                 struct sock *sock = ctx->ring_sock->sk;
6161                 struct sk_buff *skb;
6162
6163                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6164                         kfree_skb(skb);
6165         }
6166 #else
6167         int i;
6168
6169         for (i = 0; i < ctx->nr_user_files; i++) {
6170                 struct file *file;
6171
6172                 file = io_file_from_index(ctx, i);
6173                 if (file)
6174                         fput(file);
6175         }
6176 #endif
6177 }
6178
6179 static void io_file_ref_kill(struct percpu_ref *ref)
6180 {
6181         struct fixed_file_data *data;
6182
6183         data = container_of(ref, struct fixed_file_data, refs);
6184         complete(&data->done);
6185 }
6186
6187 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6188 {
6189         struct fixed_file_data *data = ctx->file_data;
6190         struct fixed_file_ref_node *ref_node = NULL;
6191         unsigned nr_tables, i;
6192         unsigned long flags;
6193
6194         if (!data)
6195                 return -ENXIO;
6196
6197         spin_lock_irqsave(&data->lock, flags);
6198         if (!list_empty(&data->ref_list))
6199                 ref_node = list_first_entry(&data->ref_list,
6200                                 struct fixed_file_ref_node, node);
6201         spin_unlock_irqrestore(&data->lock, flags);
6202         if (ref_node)
6203                 percpu_ref_kill(&ref_node->refs);
6204
6205         percpu_ref_kill(&data->refs);
6206
6207         /* wait for all refs nodes to complete */
6208         wait_for_completion(&data->done);
6209
6210         __io_sqe_files_unregister(ctx);
6211         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6212         for (i = 0; i < nr_tables; i++)
6213                 kfree(data->table[i].files);
6214         kfree(data->table);
6215         percpu_ref_exit(&data->refs);
6216         kfree(data);
6217         ctx->file_data = NULL;
6218         ctx->nr_user_files = 0;
6219         return 0;
6220 }
6221
6222 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6223 {
6224         if (ctx->sqo_thread) {
6225                 wait_for_completion(&ctx->completions[1]);
6226                 /*
6227                  * The park is a bit of a work-around, without it we get
6228                  * warning spews on shutdown with SQPOLL set and affinity
6229                  * set to a single CPU.
6230                  */
6231                 kthread_park(ctx->sqo_thread);
6232                 kthread_stop(ctx->sqo_thread);
6233                 ctx->sqo_thread = NULL;
6234         }
6235 }
6236
6237 static void io_finish_async(struct io_ring_ctx *ctx)
6238 {
6239         io_sq_thread_stop(ctx);
6240
6241         if (ctx->io_wq) {
6242                 io_wq_destroy(ctx->io_wq);
6243                 ctx->io_wq = NULL;
6244         }
6245 }
6246
6247 #if defined(CONFIG_UNIX)
6248 /*
6249  * Ensure the UNIX gc is aware of our file set, so we are certain that
6250  * the io_uring can be safely unregistered on process exit, even if we have
6251  * loops in the file referencing.
6252  */
6253 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6254 {
6255         struct sock *sk = ctx->ring_sock->sk;
6256         struct scm_fp_list *fpl;
6257         struct sk_buff *skb;
6258         int i, nr_files;
6259
6260         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6261         if (!fpl)
6262                 return -ENOMEM;
6263
6264         skb = alloc_skb(0, GFP_KERNEL);
6265         if (!skb) {
6266                 kfree(fpl);
6267                 return -ENOMEM;
6268         }
6269
6270         skb->sk = sk;
6271
6272         nr_files = 0;
6273         fpl->user = get_uid(ctx->user);
6274         for (i = 0; i < nr; i++) {
6275                 struct file *file = io_file_from_index(ctx, i + offset);
6276
6277                 if (!file)
6278                         continue;
6279                 fpl->fp[nr_files] = get_file(file);
6280                 unix_inflight(fpl->user, fpl->fp[nr_files]);
6281                 nr_files++;
6282         }
6283
6284         if (nr_files) {
6285                 fpl->max = SCM_MAX_FD;
6286                 fpl->count = nr_files;
6287                 UNIXCB(skb).fp = fpl;
6288                 skb->destructor = unix_destruct_scm;
6289                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6290                 skb_queue_head(&sk->sk_receive_queue, skb);
6291
6292                 for (i = 0; i < nr_files; i++)
6293                         fput(fpl->fp[i]);
6294         } else {
6295                 kfree_skb(skb);
6296                 kfree(fpl);
6297         }
6298
6299         return 0;
6300 }
6301
6302 /*
6303  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6304  * causes regular reference counting to break down. We rely on the UNIX
6305  * garbage collection to take care of this problem for us.
6306  */
6307 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6308 {
6309         unsigned left, total;
6310         int ret = 0;
6311
6312         total = 0;
6313         left = ctx->nr_user_files;
6314         while (left) {
6315                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6316
6317                 ret = __io_sqe_files_scm(ctx, this_files, total);
6318                 if (ret)
6319                         break;
6320                 left -= this_files;
6321                 total += this_files;
6322         }
6323
6324         if (!ret)
6325                 return 0;
6326
6327         while (total < ctx->nr_user_files) {
6328                 struct file *file = io_file_from_index(ctx, total);
6329
6330                 if (file)
6331                         fput(file);
6332                 total++;
6333         }
6334
6335         return ret;
6336 }
6337 #else
6338 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6339 {
6340         return 0;
6341 }
6342 #endif
6343
6344 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6345                                     unsigned nr_files)
6346 {
6347         int i;
6348
6349         for (i = 0; i < nr_tables; i++) {
6350                 struct fixed_file_table *table = &ctx->file_data->table[i];
6351                 unsigned this_files;
6352
6353                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6354                 table->files = kcalloc(this_files, sizeof(struct file *),
6355                                         GFP_KERNEL);
6356                 if (!table->files)
6357                         break;
6358                 nr_files -= this_files;
6359         }
6360
6361         if (i == nr_tables)
6362                 return 0;
6363
6364         for (i = 0; i < nr_tables; i++) {
6365                 struct fixed_file_table *table = &ctx->file_data->table[i];
6366                 kfree(table->files);
6367         }
6368         return 1;
6369 }
6370
6371 static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
6372 {
6373 #if defined(CONFIG_UNIX)
6374         struct sock *sock = ctx->ring_sock->sk;
6375         struct sk_buff_head list, *head = &sock->sk_receive_queue;
6376         struct sk_buff *skb;
6377         int i;
6378
6379         __skb_queue_head_init(&list);
6380
6381         /*
6382          * Find the skb that holds this file in its SCM_RIGHTS. When found,
6383          * remove this entry and rearrange the file array.
6384          */
6385         skb = skb_dequeue(head);
6386         while (skb) {
6387                 struct scm_fp_list *fp;
6388
6389                 fp = UNIXCB(skb).fp;
6390                 for (i = 0; i < fp->count; i++) {
6391                         int left;
6392
6393                         if (fp->fp[i] != file)
6394                                 continue;
6395
6396                         unix_notinflight(fp->user, fp->fp[i]);
6397                         left = fp->count - 1 - i;
6398                         if (left) {
6399                                 memmove(&fp->fp[i], &fp->fp[i + 1],
6400                                                 left * sizeof(struct file *));
6401                         }
6402                         fp->count--;
6403                         if (!fp->count) {
6404                                 kfree_skb(skb);
6405                                 skb = NULL;
6406                         } else {
6407                                 __skb_queue_tail(&list, skb);
6408                         }
6409                         fput(file);
6410                         file = NULL;
6411                         break;
6412                 }
6413
6414                 if (!file)
6415                         break;
6416
6417                 __skb_queue_tail(&list, skb);
6418
6419                 skb = skb_dequeue(head);
6420         }
6421
6422         if (skb_peek(&list)) {
6423                 spin_lock_irq(&head->lock);
6424                 while ((skb = __skb_dequeue(&list)) != NULL)
6425                         __skb_queue_tail(head, skb);
6426                 spin_unlock_irq(&head->lock);
6427         }
6428 #else
6429         fput(file);
6430 #endif
6431 }
6432
6433 struct io_file_put {
6434         struct list_head list;
6435         struct file *file;
6436 };
6437
6438 static void io_file_put_work(struct work_struct *work)
6439 {
6440         struct fixed_file_ref_node *ref_node;
6441         struct fixed_file_data *file_data;
6442         struct io_ring_ctx *ctx;
6443         struct io_file_put *pfile, *tmp;
6444         unsigned long flags;
6445
6446         ref_node = container_of(work, struct fixed_file_ref_node, work);
6447         file_data = ref_node->file_data;
6448         ctx = file_data->ctx;
6449
6450         list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
6451                 list_del_init(&pfile->list);
6452                 io_ring_file_put(ctx, pfile->file);
6453                 kfree(pfile);
6454         }
6455
6456         spin_lock_irqsave(&file_data->lock, flags);
6457         list_del_init(&ref_node->node);
6458         spin_unlock_irqrestore(&file_data->lock, flags);
6459
6460         percpu_ref_exit(&ref_node->refs);
6461         kfree(ref_node);
6462         percpu_ref_put(&file_data->refs);
6463 }
6464
6465 static void io_file_data_ref_zero(struct percpu_ref *ref)
6466 {
6467         struct fixed_file_ref_node *ref_node;
6468
6469         ref_node = container_of(ref, struct fixed_file_ref_node, refs);
6470
6471         queue_work(system_wq, &ref_node->work);
6472 }
6473
6474 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
6475                         struct io_ring_ctx *ctx)
6476 {
6477         struct fixed_file_ref_node *ref_node;
6478
6479         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
6480         if (!ref_node)
6481                 return ERR_PTR(-ENOMEM);
6482
6483         if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
6484                             0, GFP_KERNEL)) {
6485                 kfree(ref_node);
6486                 return ERR_PTR(-ENOMEM);
6487         }
6488         INIT_LIST_HEAD(&ref_node->node);
6489         INIT_LIST_HEAD(&ref_node->file_list);
6490         INIT_WORK(&ref_node->work, io_file_put_work);
6491         ref_node->file_data = ctx->file_data;
6492         return ref_node;
6493
6494 }
6495
6496 static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
6497 {
6498         percpu_ref_exit(&ref_node->refs);
6499         kfree(ref_node);
6500 }
6501
6502 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6503                                  unsigned nr_args)
6504 {
6505         __s32 __user *fds = (__s32 __user *) arg;
6506         unsigned nr_tables;
6507         struct file *file;
6508         int fd, ret = 0;
6509         unsigned i;
6510         struct fixed_file_ref_node *ref_node;
6511         unsigned long flags;
6512
6513         if (ctx->file_data)
6514                 return -EBUSY;
6515         if (!nr_args)
6516                 return -EINVAL;
6517         if (nr_args > IORING_MAX_FIXED_FILES)
6518                 return -EMFILE;
6519
6520         ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6521         if (!ctx->file_data)
6522                 return -ENOMEM;
6523         ctx->file_data->ctx = ctx;
6524         init_completion(&ctx->file_data->done);
6525         INIT_LIST_HEAD(&ctx->file_data->ref_list);
6526         spin_lock_init(&ctx->file_data->lock);
6527
6528         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
6529         ctx->file_data->table = kcalloc(nr_tables,
6530                                         sizeof(struct fixed_file_table),
6531                                         GFP_KERNEL);
6532         if (!ctx->file_data->table) {
6533                 kfree(ctx->file_data);
6534                 ctx->file_data = NULL;
6535                 return -ENOMEM;
6536         }
6537
6538         if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
6539                                 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6540                 kfree(ctx->file_data->table);
6541                 kfree(ctx->file_data);
6542                 ctx->file_data = NULL;
6543                 return -ENOMEM;
6544         }
6545
6546         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6547                 percpu_ref_exit(&ctx->file_data->refs);
6548                 kfree(ctx->file_data->table);
6549                 kfree(ctx->file_data);
6550                 ctx->file_data = NULL;
6551                 return -ENOMEM;
6552         }
6553
6554         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6555                 struct fixed_file_table *table;
6556                 unsigned index;
6557
6558                 ret = -EFAULT;
6559                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6560                         break;
6561                 /* allow sparse sets */
6562                 if (fd == -1) {
6563                         ret = 0;
6564                         continue;
6565                 }
6566
6567                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6568                 index = i & IORING_FILE_TABLE_MASK;
6569                 file = fget(fd);
6570
6571                 ret = -EBADF;
6572                 if (!file)
6573                         break;
6574
6575                 /*
6576                  * Don't allow io_uring instances to be registered. If UNIX
6577                  * isn't enabled, then this causes a reference cycle and this
6578                  * instance can never get freed. If UNIX is enabled we'll
6579                  * handle it just fine, but there's still no point in allowing
6580                  * a ring fd as it doesn't support regular read/write anyway.
6581                  */
6582                 if (file->f_op == &io_uring_fops) {
6583                         fput(file);
6584                         break;
6585                 }
6586                 ret = 0;
6587                 table->files[index] = file;
6588         }
6589
6590         if (ret) {
6591                 for (i = 0; i < ctx->nr_user_files; i++) {
6592                         file = io_file_from_index(ctx, i);
6593                         if (file)
6594                                 fput(file);
6595                 }
6596                 for (i = 0; i < nr_tables; i++)
6597                         kfree(ctx->file_data->table[i].files);
6598
6599                 kfree(ctx->file_data->table);
6600                 kfree(ctx->file_data);
6601                 ctx->file_data = NULL;
6602                 ctx->nr_user_files = 0;
6603                 return ret;
6604         }
6605
6606         ret = io_sqe_files_scm(ctx);
6607         if (ret) {
6608                 io_sqe_files_unregister(ctx);
6609                 return ret;
6610         }
6611
6612         ref_node = alloc_fixed_file_ref_node(ctx);
6613         if (IS_ERR(ref_node)) {
6614                 io_sqe_files_unregister(ctx);
6615                 return PTR_ERR(ref_node);
6616         }
6617
6618         ctx->file_data->cur_refs = &ref_node->refs;
6619         spin_lock_irqsave(&ctx->file_data->lock, flags);
6620         list_add(&ref_node->node, &ctx->file_data->ref_list);
6621         spin_unlock_irqrestore(&ctx->file_data->lock, flags);
6622         percpu_ref_get(&ctx->file_data->refs);
6623         return ret;
6624 }
6625
6626 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
6627                                 int index)
6628 {
6629 #if defined(CONFIG_UNIX)
6630         struct sock *sock = ctx->ring_sock->sk;
6631         struct sk_buff_head *head = &sock->sk_receive_queue;
6632         struct sk_buff *skb;
6633
6634         /*
6635          * See if we can merge this file into an existing skb SCM_RIGHTS
6636          * file set. If there's no room, fall back to allocating a new skb
6637          * and filling it in.
6638          */
6639         spin_lock_irq(&head->lock);
6640         skb = skb_peek(head);
6641         if (skb) {
6642                 struct scm_fp_list *fpl = UNIXCB(skb).fp;
6643
6644                 if (fpl->count < SCM_MAX_FD) {
6645                         __skb_unlink(skb, head);
6646                         spin_unlock_irq(&head->lock);
6647                         fpl->fp[fpl->count] = get_file(file);
6648                         unix_inflight(fpl->user, fpl->fp[fpl->count]);
6649                         fpl->count++;
6650                         spin_lock_irq(&head->lock);
6651                         __skb_queue_head(head, skb);
6652                 } else {
6653                         skb = NULL;
6654                 }
6655         }
6656         spin_unlock_irq(&head->lock);
6657
6658         if (skb) {
6659                 fput(file);
6660                 return 0;
6661         }
6662
6663         return __io_sqe_files_scm(ctx, 1, index);
6664 #else
6665         return 0;
6666 #endif
6667 }
6668
6669 static int io_queue_file_removal(struct fixed_file_data *data,
6670                                  struct file *file)
6671 {
6672         struct io_file_put *pfile;
6673         struct percpu_ref *refs = data->cur_refs;
6674         struct fixed_file_ref_node *ref_node;
6675
6676         pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6677         if (!pfile)
6678                 return -ENOMEM;
6679
6680         ref_node = container_of(refs, struct fixed_file_ref_node, refs);
6681         pfile->file = file;
6682         list_add(&pfile->list, &ref_node->file_list);
6683
6684         return 0;
6685 }
6686
6687 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6688                                  struct io_uring_files_update *up,
6689                                  unsigned nr_args)
6690 {
6691         struct fixed_file_data *data = ctx->file_data;
6692         struct fixed_file_ref_node *ref_node;
6693         struct file *file;
6694         __s32 __user *fds;
6695         int fd, i, err;
6696         __u32 done;
6697         unsigned long flags;
6698         bool needs_switch = false;
6699
6700         if (check_add_overflow(up->offset, nr_args, &done))
6701                 return -EOVERFLOW;
6702         if (done > ctx->nr_user_files)
6703                 return -EINVAL;
6704
6705         ref_node = alloc_fixed_file_ref_node(ctx);
6706         if (IS_ERR(ref_node))
6707                 return PTR_ERR(ref_node);
6708
6709         done = 0;
6710         fds = u64_to_user_ptr(up->fds);
6711         while (nr_args) {
6712                 struct fixed_file_table *table;
6713                 unsigned index;
6714
6715                 err = 0;
6716                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6717                         err = -EFAULT;
6718                         break;
6719                 }
6720                 i = array_index_nospec(up->offset, ctx->nr_user_files);
6721                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6722                 index = i & IORING_FILE_TABLE_MASK;
6723                 if (table->files[index]) {
6724                         file = io_file_from_index(ctx, index);
6725                         err = io_queue_file_removal(data, file);
6726                         if (err)
6727                                 break;
6728                         table->files[index] = NULL;
6729                         needs_switch = true;
6730                 }
6731                 if (fd != -1) {
6732                         file = fget(fd);
6733                         if (!file) {
6734                                 err = -EBADF;
6735                                 break;
6736                         }
6737                         /*
6738                          * Don't allow io_uring instances to be registered. If
6739                          * UNIX isn't enabled, then this causes a reference
6740                          * cycle and this instance can never get freed. If UNIX
6741                          * is enabled we'll handle it just fine, but there's
6742                          * still no point in allowing a ring fd as it doesn't
6743                          * support regular read/write anyway.
6744                          */
6745                         if (file->f_op == &io_uring_fops) {
6746                                 fput(file);
6747                                 err = -EBADF;
6748                                 break;
6749                         }
6750                         table->files[index] = file;
6751                         err = io_sqe_file_register(ctx, file, i);
6752                         if (err)
6753                                 break;
6754                 }
6755                 nr_args--;
6756                 done++;
6757                 up->offset++;
6758         }
6759
6760         if (needs_switch) {
6761                 percpu_ref_kill(data->cur_refs);
6762                 spin_lock_irqsave(&data->lock, flags);
6763                 list_add(&ref_node->node, &data->ref_list);
6764                 data->cur_refs = &ref_node->refs;
6765                 spin_unlock_irqrestore(&data->lock, flags);
6766                 percpu_ref_get(&ctx->file_data->refs);
6767         } else
6768                 destroy_fixed_file_ref_node(ref_node);
6769
6770         return done ? done : err;
6771 }
6772
6773 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6774                                unsigned nr_args)
6775 {
6776         struct io_uring_files_update up;
6777
6778         if (!ctx->file_data)
6779                 return -ENXIO;
6780         if (!nr_args)
6781                 return -EINVAL;
6782         if (copy_from_user(&up, arg, sizeof(up)))
6783                 return -EFAULT;
6784         if (up.resv)
6785                 return -EINVAL;
6786
6787         return __io_sqe_files_update(ctx, &up, nr_args);
6788 }
6789
6790 static void io_free_work(struct io_wq_work *work)
6791 {
6792         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6793
6794         /* Consider that io_steal_work() relies on this ref */
6795         io_put_req(req);
6796 }
6797
6798 static int io_init_wq_offload(struct io_ring_ctx *ctx,
6799                               struct io_uring_params *p)
6800 {
6801         struct io_wq_data data;
6802         struct fd f;
6803         struct io_ring_ctx *ctx_attach;
6804         unsigned int concurrency;
6805         int ret = 0;
6806
6807         data.user = ctx->user;
6808         data.free_work = io_free_work;
6809
6810         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6811                 /* Do QD, or 4 * CPUS, whatever is smallest */
6812                 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6813
6814                 ctx->io_wq = io_wq_create(concurrency, &data);
6815                 if (IS_ERR(ctx->io_wq)) {
6816                         ret = PTR_ERR(ctx->io_wq);
6817                         ctx->io_wq = NULL;
6818                 }
6819                 return ret;
6820         }
6821
6822         f = fdget(p->wq_fd);
6823         if (!f.file)
6824                 return -EBADF;
6825
6826         if (f.file->f_op != &io_uring_fops) {
6827                 ret = -EINVAL;
6828                 goto out_fput;
6829         }
6830
6831         ctx_attach = f.file->private_data;
6832         /* @io_wq is protected by holding the fd */
6833         if (!io_wq_get(ctx_attach->io_wq, &data)) {
6834                 ret = -EINVAL;
6835                 goto out_fput;
6836         }
6837
6838         ctx->io_wq = ctx_attach->io_wq;
6839 out_fput:
6840         fdput(f);
6841         return ret;
6842 }
6843
6844 static int io_sq_offload_start(struct io_ring_ctx *ctx,
6845                                struct io_uring_params *p)
6846 {
6847         int ret;
6848
6849         mmgrab(current->mm);
6850         ctx->sqo_mm = current->mm;
6851
6852         if (ctx->flags & IORING_SETUP_SQPOLL) {
6853                 ret = -EPERM;
6854                 if (!capable(CAP_SYS_ADMIN))
6855                         goto err;
6856
6857                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6858                 if (!ctx->sq_thread_idle)
6859                         ctx->sq_thread_idle = HZ;
6860
6861                 if (p->flags & IORING_SETUP_SQ_AFF) {
6862                         int cpu = p->sq_thread_cpu;
6863
6864                         ret = -EINVAL;
6865                         if (cpu >= nr_cpu_ids)
6866                                 goto err;
6867                         if (!cpu_online(cpu))
6868                                 goto err;
6869
6870                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6871                                                         ctx, cpu,
6872                                                         "io_uring-sq");
6873                 } else {
6874                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6875                                                         "io_uring-sq");
6876                 }
6877                 if (IS_ERR(ctx->sqo_thread)) {
6878                         ret = PTR_ERR(ctx->sqo_thread);
6879                         ctx->sqo_thread = NULL;
6880                         goto err;
6881                 }
6882                 wake_up_process(ctx->sqo_thread);
6883         } else if (p->flags & IORING_SETUP_SQ_AFF) {
6884                 /* Can't have SQ_AFF without SQPOLL */
6885                 ret = -EINVAL;
6886                 goto err;
6887         }
6888
6889         ret = io_init_wq_offload(ctx, p);
6890         if (ret)
6891                 goto err;
6892
6893         return 0;
6894 err:
6895         io_finish_async(ctx);
6896         mmdrop(ctx->sqo_mm);
6897         ctx->sqo_mm = NULL;
6898         return ret;
6899 }
6900
6901 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
6902 {
6903         atomic_long_sub(nr_pages, &user->locked_vm);
6904 }
6905
6906 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
6907 {
6908         unsigned long page_limit, cur_pages, new_pages;
6909
6910         /* Don't allow more pages than we can safely lock */
6911         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
6912
6913         do {
6914                 cur_pages = atomic_long_read(&user->locked_vm);
6915                 new_pages = cur_pages + nr_pages;
6916                 if (new_pages > page_limit)
6917                         return -ENOMEM;
6918         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
6919                                         new_pages) != cur_pages);
6920
6921         return 0;
6922 }
6923
6924 static void io_mem_free(void *ptr)
6925 {
6926         struct page *page;
6927
6928         if (!ptr)
6929                 return;
6930
6931         page = virt_to_head_page(ptr);
6932         if (put_page_testzero(page))
6933                 free_compound_page(page);
6934 }
6935
6936 static void *io_mem_alloc(size_t size)
6937 {
6938         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
6939                                 __GFP_NORETRY;
6940
6941         return (void *) __get_free_pages(gfp_flags, get_order(size));
6942 }
6943
6944 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
6945                                 size_t *sq_offset)
6946 {
6947         struct io_rings *rings;
6948         size_t off, sq_array_size;
6949
6950         off = struct_size(rings, cqes, cq_entries);
6951         if (off == SIZE_MAX)
6952                 return SIZE_MAX;
6953
6954 #ifdef CONFIG_SMP
6955         off = ALIGN(off, SMP_CACHE_BYTES);
6956         if (off == 0)
6957                 return SIZE_MAX;
6958 #endif
6959
6960         sq_array_size = array_size(sizeof(u32), sq_entries);
6961         if (sq_array_size == SIZE_MAX)
6962                 return SIZE_MAX;
6963
6964         if (check_add_overflow(off, sq_array_size, &off))
6965                 return SIZE_MAX;
6966
6967         if (sq_offset)
6968                 *sq_offset = off;
6969
6970         return off;
6971 }
6972
6973 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
6974 {
6975         size_t pages;
6976
6977         pages = (size_t)1 << get_order(
6978                 rings_size(sq_entries, cq_entries, NULL));
6979         pages += (size_t)1 << get_order(
6980                 array_size(sizeof(struct io_uring_sqe), sq_entries));
6981
6982         return pages;
6983 }
6984
6985 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
6986 {
6987         int i, j;
6988
6989         if (!ctx->user_bufs)
6990                 return -ENXIO;
6991
6992         for (i = 0; i < ctx->nr_user_bufs; i++) {
6993                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
6994
6995                 for (j = 0; j < imu->nr_bvecs; j++)
6996                         unpin_user_page(imu->bvec[j].bv_page);
6997
6998                 if (ctx->account_mem)
6999                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
7000                 kvfree(imu->bvec);
7001                 imu->nr_bvecs = 0;
7002         }
7003
7004         kfree(ctx->user_bufs);
7005         ctx->user_bufs = NULL;
7006         ctx->nr_user_bufs = 0;
7007         return 0;
7008 }
7009
7010 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
7011                        void __user *arg, unsigned index)
7012 {
7013         struct iovec __user *src;
7014
7015 #ifdef CONFIG_COMPAT
7016         if (ctx->compat) {
7017                 struct compat_iovec __user *ciovs;
7018                 struct compat_iovec ciov;
7019
7020                 ciovs = (struct compat_iovec __user *) arg;
7021                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
7022                         return -EFAULT;
7023
7024                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
7025                 dst->iov_len = ciov.iov_len;
7026                 return 0;
7027         }
7028 #endif
7029         src = (struct iovec __user *) arg;
7030         if (copy_from_user(dst, &src[index], sizeof(*dst)))
7031                 return -EFAULT;
7032         return 0;
7033 }
7034
7035 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
7036                                   unsigned nr_args)
7037 {
7038         struct vm_area_struct **vmas = NULL;
7039         struct page **pages = NULL;
7040         int i, j, got_pages = 0;
7041         int ret = -EINVAL;
7042
7043         if (ctx->user_bufs)
7044                 return -EBUSY;
7045         if (!nr_args || nr_args > UIO_MAXIOV)
7046                 return -EINVAL;
7047
7048         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
7049                                         GFP_KERNEL);
7050         if (!ctx->user_bufs)
7051                 return -ENOMEM;
7052
7053         for (i = 0; i < nr_args; i++) {
7054                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7055                 unsigned long off, start, end, ubuf;
7056                 int pret, nr_pages;
7057                 struct iovec iov;
7058                 size_t size;
7059
7060                 ret = io_copy_iov(ctx, &iov, arg, i);
7061                 if (ret)
7062                         goto err;
7063
7064                 /*
7065                  * Don't impose further limits on the size and buffer
7066                  * constraints here, we'll -EINVAL later when IO is
7067                  * submitted if they are wrong.
7068                  */
7069                 ret = -EFAULT;
7070                 if (!iov.iov_base || !iov.iov_len)
7071                         goto err;
7072
7073                 /* arbitrary limit, but we need something */
7074                 if (iov.iov_len > SZ_1G)
7075                         goto err;
7076
7077                 ubuf = (unsigned long) iov.iov_base;
7078                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
7079                 start = ubuf >> PAGE_SHIFT;
7080                 nr_pages = end - start;
7081
7082                 if (ctx->account_mem) {
7083                         ret = io_account_mem(ctx->user, nr_pages);
7084                         if (ret)
7085                                 goto err;
7086                 }
7087
7088                 ret = 0;
7089                 if (!pages || nr_pages > got_pages) {
7090                         kfree(vmas);
7091                         kfree(pages);
7092                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
7093                                                 GFP_KERNEL);
7094                         vmas = kvmalloc_array(nr_pages,
7095                                         sizeof(struct vm_area_struct *),
7096                                         GFP_KERNEL);
7097                         if (!pages || !vmas) {
7098                                 ret = -ENOMEM;
7099                                 if (ctx->account_mem)
7100                                         io_unaccount_mem(ctx->user, nr_pages);
7101                                 goto err;
7102                         }
7103                         got_pages = nr_pages;
7104                 }
7105
7106                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
7107                                                 GFP_KERNEL);
7108                 ret = -ENOMEM;
7109                 if (!imu->bvec) {
7110                         if (ctx->account_mem)
7111                                 io_unaccount_mem(ctx->user, nr_pages);
7112                         goto err;
7113                 }
7114
7115                 ret = 0;
7116                 down_read(&current->mm->mmap_sem);
7117                 pret = pin_user_pages(ubuf, nr_pages,
7118                                       FOLL_WRITE | FOLL_LONGTERM,
7119                                       pages, vmas);
7120                 if (pret == nr_pages) {
7121                         /* don't support file backed memory */
7122                         for (j = 0; j < nr_pages; j++) {
7123                                 struct vm_area_struct *vma = vmas[j];
7124
7125                                 if (vma->vm_file &&
7126                                     !is_file_hugepages(vma->vm_file)) {
7127                                         ret = -EOPNOTSUPP;
7128                                         break;
7129                                 }
7130                         }
7131                 } else {
7132                         ret = pret < 0 ? pret : -EFAULT;
7133                 }
7134                 up_read(&current->mm->mmap_sem);
7135                 if (ret) {
7136                         /*
7137                          * if we did partial map, or found file backed vmas,
7138                          * release any pages we did get
7139                          */
7140                         if (pret > 0)
7141                                 unpin_user_pages(pages, pret);
7142                         if (ctx->account_mem)
7143                                 io_unaccount_mem(ctx->user, nr_pages);
7144                         kvfree(imu->bvec);
7145                         goto err;
7146                 }
7147
7148                 off = ubuf & ~PAGE_MASK;
7149                 size = iov.iov_len;
7150                 for (j = 0; j < nr_pages; j++) {
7151                         size_t vec_len;
7152
7153                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
7154                         imu->bvec[j].bv_page = pages[j];
7155                         imu->bvec[j].bv_len = vec_len;
7156                         imu->bvec[j].bv_offset = off;
7157                         off = 0;
7158                         size -= vec_len;
7159                 }
7160                 /* store original address for later verification */
7161                 imu->ubuf = ubuf;
7162                 imu->len = iov.iov_len;
7163                 imu->nr_bvecs = nr_pages;
7164
7165                 ctx->nr_user_bufs++;
7166         }
7167         kvfree(pages);
7168         kvfree(vmas);
7169         return 0;
7170 err:
7171         kvfree(pages);
7172         kvfree(vmas);
7173         io_sqe_buffer_unregister(ctx);
7174         return ret;
7175 }
7176
7177 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7178 {
7179         __s32 __user *fds = arg;
7180         int fd;
7181
7182         if (ctx->cq_ev_fd)
7183                 return -EBUSY;
7184
7185         if (copy_from_user(&fd, fds, sizeof(*fds)))
7186                 return -EFAULT;
7187
7188         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7189         if (IS_ERR(ctx->cq_ev_fd)) {
7190                 int ret = PTR_ERR(ctx->cq_ev_fd);
7191                 ctx->cq_ev_fd = NULL;
7192                 return ret;
7193         }
7194
7195         return 0;
7196 }
7197
7198 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7199 {
7200         if (ctx->cq_ev_fd) {
7201                 eventfd_ctx_put(ctx->cq_ev_fd);
7202                 ctx->cq_ev_fd = NULL;
7203                 return 0;
7204         }
7205
7206         return -ENXIO;
7207 }
7208
7209 static int __io_destroy_buffers(int id, void *p, void *data)
7210 {
7211         struct io_ring_ctx *ctx = data;
7212         struct io_buffer *buf = p;
7213
7214         __io_remove_buffers(ctx, buf, id, -1U);
7215         return 0;
7216 }
7217
7218 static void io_destroy_buffers(struct io_ring_ctx *ctx)
7219 {
7220         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
7221         idr_destroy(&ctx->io_buffer_idr);
7222 }
7223
7224 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
7225 {
7226         io_finish_async(ctx);
7227         if (ctx->sqo_mm)
7228                 mmdrop(ctx->sqo_mm);
7229
7230         io_iopoll_reap_events(ctx);
7231         io_sqe_buffer_unregister(ctx);
7232         io_sqe_files_unregister(ctx);
7233         io_eventfd_unregister(ctx);
7234         io_destroy_buffers(ctx);
7235         idr_destroy(&ctx->personality_idr);
7236
7237 #if defined(CONFIG_UNIX)
7238         if (ctx->ring_sock) {
7239                 ctx->ring_sock->file = NULL; /* so that iput() is called */
7240                 sock_release(ctx->ring_sock);
7241         }
7242 #endif
7243
7244         io_mem_free(ctx->rings);
7245         io_mem_free(ctx->sq_sqes);
7246
7247         percpu_ref_exit(&ctx->refs);
7248         if (ctx->account_mem)
7249                 io_unaccount_mem(ctx->user,
7250                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
7251         free_uid(ctx->user);
7252         put_cred(ctx->creds);
7253         kfree(ctx->completions);
7254         kfree(ctx->cancel_hash);
7255         kmem_cache_free(req_cachep, ctx->fallback_req);
7256         kfree(ctx);
7257 }
7258
7259 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
7260 {
7261         struct io_ring_ctx *ctx = file->private_data;
7262         __poll_t mask = 0;
7263
7264         poll_wait(file, &ctx->cq_wait, wait);
7265         /*
7266          * synchronizes with barrier from wq_has_sleeper call in
7267          * io_commit_cqring
7268          */
7269         smp_rmb();
7270         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
7271             ctx->rings->sq_ring_entries)
7272                 mask |= EPOLLOUT | EPOLLWRNORM;
7273         if (io_cqring_events(ctx, false))
7274                 mask |= EPOLLIN | EPOLLRDNORM;
7275
7276         return mask;
7277 }
7278
7279 static int io_uring_fasync(int fd, struct file *file, int on)
7280 {
7281         struct io_ring_ctx *ctx = file->private_data;
7282
7283         return fasync_helper(fd, file, on, &ctx->cq_fasync);
7284 }
7285
7286 static int io_remove_personalities(int id, void *p, void *data)
7287 {
7288         struct io_ring_ctx *ctx = data;
7289         const struct cred *cred;
7290
7291         cred = idr_remove(&ctx->personality_idr, id);
7292         if (cred)
7293                 put_cred(cred);
7294         return 0;
7295 }
7296
7297 static void io_ring_exit_work(struct work_struct *work)
7298 {
7299         struct io_ring_ctx *ctx;
7300
7301         ctx = container_of(work, struct io_ring_ctx, exit_work);
7302         if (ctx->rings)
7303                 io_cqring_overflow_flush(ctx, true);
7304
7305         wait_for_completion(&ctx->completions[0]);
7306         io_ring_ctx_free(ctx);
7307 }
7308
7309 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
7310 {
7311         mutex_lock(&ctx->uring_lock);
7312         percpu_ref_kill(&ctx->refs);
7313         mutex_unlock(&ctx->uring_lock);
7314
7315         /*
7316          * Wait for sq thread to idle, if we have one. It won't spin on new
7317          * work after we've killed the ctx ref above. This is important to do
7318          * before we cancel existing commands, as the thread could otherwise
7319          * be queueing new work post that. If that's work we need to cancel,
7320          * it could cause shutdown to hang.
7321          */
7322         while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
7323                 cond_resched();
7324
7325         io_kill_timeouts(ctx);
7326         io_poll_remove_all(ctx);
7327
7328         if (ctx->io_wq)
7329                 io_wq_cancel_all(ctx->io_wq);
7330
7331         io_iopoll_reap_events(ctx);
7332         /* if we failed setting up the ctx, we might not have any rings */
7333         if (ctx->rings)
7334                 io_cqring_overflow_flush(ctx, true);
7335         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
7336         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
7337         queue_work(system_wq, &ctx->exit_work);
7338 }
7339
7340 static int io_uring_release(struct inode *inode, struct file *file)
7341 {
7342         struct io_ring_ctx *ctx = file->private_data;
7343
7344         file->private_data = NULL;
7345         io_ring_ctx_wait_and_kill(ctx);
7346         return 0;
7347 }
7348
7349 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
7350                                   struct files_struct *files)
7351 {
7352         while (!list_empty_careful(&ctx->inflight_list)) {
7353                 struct io_kiocb *cancel_req = NULL, *req;
7354                 DEFINE_WAIT(wait);
7355
7356                 spin_lock_irq(&ctx->inflight_lock);
7357                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
7358                         if (req->work.files != files)
7359                                 continue;
7360                         /* req is being completed, ignore */
7361                         if (!refcount_inc_not_zero(&req->refs))
7362                                 continue;
7363                         cancel_req = req;
7364                         break;
7365                 }
7366                 if (cancel_req)
7367                         prepare_to_wait(&ctx->inflight_wait, &wait,
7368                                                 TASK_UNINTERRUPTIBLE);
7369                 spin_unlock_irq(&ctx->inflight_lock);
7370
7371                 /* We need to keep going until we don't find a matching req */
7372                 if (!cancel_req)
7373                         break;
7374
7375                 if (cancel_req->flags & REQ_F_OVERFLOW) {
7376                         spin_lock_irq(&ctx->completion_lock);
7377                         list_del(&cancel_req->list);
7378                         cancel_req->flags &= ~REQ_F_OVERFLOW;
7379                         if (list_empty(&ctx->cq_overflow_list)) {
7380                                 clear_bit(0, &ctx->sq_check_overflow);
7381                                 clear_bit(0, &ctx->cq_check_overflow);
7382                         }
7383                         spin_unlock_irq(&ctx->completion_lock);
7384
7385                         WRITE_ONCE(ctx->rings->cq_overflow,
7386                                 atomic_inc_return(&ctx->cached_cq_overflow));
7387
7388                         /*
7389                          * Put inflight ref and overflow ref. If that's
7390                          * all we had, then we're done with this request.
7391                          */
7392                         if (refcount_sub_and_test(2, &cancel_req->refs)) {
7393                                 io_put_req(cancel_req);
7394                                 finish_wait(&ctx->inflight_wait, &wait);
7395                                 continue;
7396                         }
7397                 }
7398
7399                 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
7400                 io_put_req(cancel_req);
7401                 schedule();
7402                 finish_wait(&ctx->inflight_wait, &wait);
7403         }
7404 }
7405
7406 static int io_uring_flush(struct file *file, void *data)
7407 {
7408         struct io_ring_ctx *ctx = file->private_data;
7409
7410         io_uring_cancel_files(ctx, data);
7411
7412         /*
7413          * If the task is going away, cancel work it may have pending
7414          */
7415         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7416                 io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
7417
7418         return 0;
7419 }
7420
7421 static void *io_uring_validate_mmap_request(struct file *file,
7422                                             loff_t pgoff, size_t sz)
7423 {
7424         struct io_ring_ctx *ctx = file->private_data;
7425         loff_t offset = pgoff << PAGE_SHIFT;
7426         struct page *page;
7427         void *ptr;
7428
7429         switch (offset) {
7430         case IORING_OFF_SQ_RING:
7431         case IORING_OFF_CQ_RING:
7432                 ptr = ctx->rings;
7433                 break;
7434         case IORING_OFF_SQES:
7435                 ptr = ctx->sq_sqes;
7436                 break;
7437         default:
7438                 return ERR_PTR(-EINVAL);
7439         }
7440
7441         page = virt_to_head_page(ptr);
7442         if (sz > page_size(page))
7443                 return ERR_PTR(-EINVAL);
7444
7445         return ptr;
7446 }
7447
7448 #ifdef CONFIG_MMU
7449
7450 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7451 {
7452         size_t sz = vma->vm_end - vma->vm_start;
7453         unsigned long pfn;
7454         void *ptr;
7455
7456         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7457         if (IS_ERR(ptr))
7458                 return PTR_ERR(ptr);
7459
7460         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7461         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7462 }
7463
7464 #else /* !CONFIG_MMU */
7465
7466 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7467 {
7468         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7469 }
7470
7471 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7472 {
7473         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7474 }
7475
7476 static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
7477         unsigned long addr, unsigned long len,
7478         unsigned long pgoff, unsigned long flags)
7479 {
7480         void *ptr;
7481
7482         ptr = io_uring_validate_mmap_request(file, pgoff, len);
7483         if (IS_ERR(ptr))
7484                 return PTR_ERR(ptr);
7485
7486         return (unsigned long) ptr;
7487 }
7488
7489 #endif /* !CONFIG_MMU */
7490
7491 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7492                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
7493                 size_t, sigsz)
7494 {
7495         struct io_ring_ctx *ctx;
7496         long ret = -EBADF;
7497         int submitted = 0;
7498         struct fd f;
7499
7500         if (current->task_works)
7501                 task_work_run();
7502
7503         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
7504                 return -EINVAL;
7505
7506         f = fdget(fd);
7507         if (!f.file)
7508                 return -EBADF;
7509
7510         ret = -EOPNOTSUPP;
7511         if (f.file->f_op != &io_uring_fops)
7512                 goto out_fput;
7513
7514         ret = -ENXIO;
7515         ctx = f.file->private_data;
7516         if (!percpu_ref_tryget(&ctx->refs))
7517                 goto out_fput;
7518
7519         /*
7520          * For SQ polling, the thread will do all submissions and completions.
7521          * Just return the requested submit count, and wake the thread if
7522          * we were asked to.
7523          */
7524         ret = 0;
7525         if (ctx->flags & IORING_SETUP_SQPOLL) {
7526                 if (!list_empty_careful(&ctx->cq_overflow_list))
7527                         io_cqring_overflow_flush(ctx, false);
7528                 if (flags & IORING_ENTER_SQ_WAKEUP)
7529                         wake_up(&ctx->sqo_wait);
7530                 submitted = to_submit;
7531         } else if (to_submit) {
7532                 mutex_lock(&ctx->uring_lock);
7533                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
7534                 mutex_unlock(&ctx->uring_lock);
7535
7536                 if (submitted != to_submit)
7537                         goto out;
7538         }
7539         if (flags & IORING_ENTER_GETEVENTS) {
7540                 unsigned nr_events = 0;
7541
7542                 min_complete = min(min_complete, ctx->cq_entries);
7543
7544                 /*
7545                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
7546                  * space applications don't need to do io completion events
7547                  * polling again, they can rely on io_sq_thread to do polling
7548                  * work, which can reduce cpu usage and uring_lock contention.
7549                  */
7550                 if (ctx->flags & IORING_SETUP_IOPOLL &&
7551                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
7552                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
7553                 } else {
7554                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7555                 }
7556         }
7557
7558 out:
7559         percpu_ref_put(&ctx->refs);
7560 out_fput:
7561         fdput(f);
7562         return submitted ? submitted : ret;
7563 }
7564
7565 #ifdef CONFIG_PROC_FS
7566 static int io_uring_show_cred(int id, void *p, void *data)
7567 {
7568         const struct cred *cred = p;
7569         struct seq_file *m = data;
7570         struct user_namespace *uns = seq_user_ns(m);
7571         struct group_info *gi;
7572         kernel_cap_t cap;
7573         unsigned __capi;
7574         int g;
7575
7576         seq_printf(m, "%5d\n", id);
7577         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7578         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7579         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7580         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7581         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7582         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7583         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7584         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7585         seq_puts(m, "\n\tGroups:\t");
7586         gi = cred->group_info;
7587         for (g = 0; g < gi->ngroups; g++) {
7588                 seq_put_decimal_ull(m, g ? " " : "",
7589                                         from_kgid_munged(uns, gi->gid[g]));
7590         }
7591         seq_puts(m, "\n\tCapEff:\t");
7592         cap = cred->cap_effective;
7593         CAP_FOR_EACH_U32(__capi)
7594                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7595         seq_putc(m, '\n');
7596         return 0;
7597 }
7598
7599 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7600 {
7601         int i;
7602
7603         mutex_lock(&ctx->uring_lock);
7604         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7605         for (i = 0; i < ctx->nr_user_files; i++) {
7606                 struct fixed_file_table *table;
7607                 struct file *f;
7608
7609                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7610                 f = table->files[i & IORING_FILE_TABLE_MASK];
7611                 if (f)
7612                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7613                 else
7614                         seq_printf(m, "%5u: <none>\n", i);
7615         }
7616         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7617         for (i = 0; i < ctx->nr_user_bufs; i++) {
7618                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7619
7620                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7621                                                 (unsigned int) buf->len);
7622         }
7623         if (!idr_is_empty(&ctx->personality_idr)) {
7624                 seq_printf(m, "Personalities:\n");
7625                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7626         }
7627         seq_printf(m, "PollList:\n");
7628         spin_lock_irq(&ctx->completion_lock);
7629         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7630                 struct hlist_head *list = &ctx->cancel_hash[i];
7631                 struct io_kiocb *req;
7632
7633                 hlist_for_each_entry(req, list, hash_node)
7634                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
7635                                         req->task->task_works != NULL);
7636         }
7637         spin_unlock_irq(&ctx->completion_lock);
7638         mutex_unlock(&ctx->uring_lock);
7639 }
7640
7641 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7642 {
7643         struct io_ring_ctx *ctx = f->private_data;
7644
7645         if (percpu_ref_tryget(&ctx->refs)) {
7646                 __io_uring_show_fdinfo(ctx, m);
7647                 percpu_ref_put(&ctx->refs);
7648         }
7649 }
7650 #endif
7651
7652 static const struct file_operations io_uring_fops = {
7653         .release        = io_uring_release,
7654         .flush          = io_uring_flush,
7655         .mmap           = io_uring_mmap,
7656 #ifndef CONFIG_MMU
7657         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7658         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7659 #endif
7660         .poll           = io_uring_poll,
7661         .fasync         = io_uring_fasync,
7662 #ifdef CONFIG_PROC_FS
7663         .show_fdinfo    = io_uring_show_fdinfo,
7664 #endif
7665 };
7666
7667 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7668                                   struct io_uring_params *p)
7669 {
7670         struct io_rings *rings;
7671         size_t size, sq_array_offset;
7672
7673         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7674         if (size == SIZE_MAX)
7675                 return -EOVERFLOW;
7676
7677         rings = io_mem_alloc(size);
7678         if (!rings)
7679                 return -ENOMEM;
7680
7681         ctx->rings = rings;
7682         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7683         rings->sq_ring_mask = p->sq_entries - 1;
7684         rings->cq_ring_mask = p->cq_entries - 1;
7685         rings->sq_ring_entries = p->sq_entries;
7686         rings->cq_ring_entries = p->cq_entries;
7687         ctx->sq_mask = rings->sq_ring_mask;
7688         ctx->cq_mask = rings->cq_ring_mask;
7689         ctx->sq_entries = rings->sq_ring_entries;
7690         ctx->cq_entries = rings->cq_ring_entries;
7691
7692         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
7693         if (size == SIZE_MAX) {
7694                 io_mem_free(ctx->rings);
7695                 ctx->rings = NULL;
7696                 return -EOVERFLOW;
7697         }
7698
7699         ctx->sq_sqes = io_mem_alloc(size);
7700         if (!ctx->sq_sqes) {
7701                 io_mem_free(ctx->rings);
7702                 ctx->rings = NULL;
7703                 return -ENOMEM;
7704         }
7705
7706         return 0;
7707 }
7708
7709 /*
7710  * Allocate an anonymous fd, this is what constitutes the application
7711  * visible backing of an io_uring instance. The application mmaps this
7712  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7713  * we have to tie this fd to a socket for file garbage collection purposes.
7714  */
7715 static int io_uring_get_fd(struct io_ring_ctx *ctx)
7716 {
7717         struct file *file;
7718         int ret;
7719
7720 #if defined(CONFIG_UNIX)
7721         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7722                                 &ctx->ring_sock);
7723         if (ret)
7724                 return ret;
7725 #endif
7726
7727         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7728         if (ret < 0)
7729                 goto err;
7730
7731         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7732                                         O_RDWR | O_CLOEXEC);
7733         if (IS_ERR(file)) {
7734                 put_unused_fd(ret);
7735                 ret = PTR_ERR(file);
7736                 goto err;
7737         }
7738
7739 #if defined(CONFIG_UNIX)
7740         ctx->ring_sock->file = file;
7741 #endif
7742         fd_install(ret, file);
7743         return ret;
7744 err:
7745 #if defined(CONFIG_UNIX)
7746         sock_release(ctx->ring_sock);
7747         ctx->ring_sock = NULL;
7748 #endif
7749         return ret;
7750 }
7751
7752 static int io_uring_create(unsigned entries, struct io_uring_params *p,
7753                            struct io_uring_params __user *params)
7754 {
7755         struct user_struct *user = NULL;
7756         struct io_ring_ctx *ctx;
7757         bool account_mem;
7758         int ret;
7759
7760         if (!entries)
7761                 return -EINVAL;
7762         if (entries > IORING_MAX_ENTRIES) {
7763                 if (!(p->flags & IORING_SETUP_CLAMP))
7764                         return -EINVAL;
7765                 entries = IORING_MAX_ENTRIES;
7766         }
7767
7768         /*
7769          * Use twice as many entries for the CQ ring. It's possible for the
7770          * application to drive a higher depth than the size of the SQ ring,
7771          * since the sqes are only used at submission time. This allows for
7772          * some flexibility in overcommitting a bit. If the application has
7773          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
7774          * of CQ ring entries manually.
7775          */
7776         p->sq_entries = roundup_pow_of_two(entries);
7777         if (p->flags & IORING_SETUP_CQSIZE) {
7778                 /*
7779                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
7780                  * to a power-of-two, if it isn't already. We do NOT impose
7781                  * any cq vs sq ring sizing.
7782                  */
7783                 if (p->cq_entries < p->sq_entries)
7784                         return -EINVAL;
7785                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7786                         if (!(p->flags & IORING_SETUP_CLAMP))
7787                                 return -EINVAL;
7788                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
7789                 }
7790                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7791         } else {
7792                 p->cq_entries = 2 * p->sq_entries;
7793         }
7794
7795         user = get_uid(current_user());
7796         account_mem = !capable(CAP_IPC_LOCK);
7797
7798         if (account_mem) {
7799                 ret = io_account_mem(user,
7800                                 ring_pages(p->sq_entries, p->cq_entries));
7801                 if (ret) {
7802                         free_uid(user);
7803                         return ret;
7804                 }
7805         }
7806
7807         ctx = io_ring_ctx_alloc(p);
7808         if (!ctx) {
7809                 if (account_mem)
7810                         io_unaccount_mem(user, ring_pages(p->sq_entries,
7811                                                                 p->cq_entries));
7812                 free_uid(user);
7813                 return -ENOMEM;
7814         }
7815         ctx->compat = in_compat_syscall();
7816         ctx->account_mem = account_mem;
7817         ctx->user = user;
7818         ctx->creds = get_current_cred();
7819
7820         ret = io_allocate_scq_urings(ctx, p);
7821         if (ret)
7822                 goto err;
7823
7824         ret = io_sq_offload_start(ctx, p);
7825         if (ret)
7826                 goto err;
7827
7828         memset(&p->sq_off, 0, sizeof(p->sq_off));
7829         p->sq_off.head = offsetof(struct io_rings, sq.head);
7830         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7831         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7832         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7833         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7834         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7835         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
7836
7837         memset(&p->cq_off, 0, sizeof(p->cq_off));
7838         p->cq_off.head = offsetof(struct io_rings, cq.head);
7839         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7840         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7841         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7842         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7843         p->cq_off.cqes = offsetof(struct io_rings, cqes);
7844
7845         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
7846                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
7847                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
7848
7849         if (copy_to_user(params, p, sizeof(*p))) {
7850                 ret = -EFAULT;
7851                 goto err;
7852         }
7853         /*
7854          * Install ring fd as the very last thing, so we don't risk someone
7855          * having closed it before we finish setup
7856          */
7857         ret = io_uring_get_fd(ctx);
7858         if (ret < 0)
7859                 goto err;
7860
7861         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
7862         return ret;
7863 err:
7864         io_ring_ctx_wait_and_kill(ctx);
7865         return ret;
7866 }
7867
7868 /*
7869  * Sets up an aio uring context, and returns the fd. Applications asks for a
7870  * ring size, we return the actual sq/cq ring sizes (among other things) in the
7871  * params structure passed in.
7872  */
7873 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
7874 {
7875         struct io_uring_params p;
7876         int i;
7877
7878         if (copy_from_user(&p, params, sizeof(p)))
7879                 return -EFAULT;
7880         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
7881                 if (p.resv[i])
7882                         return -EINVAL;
7883         }
7884
7885         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
7886                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
7887                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
7888                 return -EINVAL;
7889
7890         return  io_uring_create(entries, &p, params);
7891 }
7892
7893 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
7894                 struct io_uring_params __user *, params)
7895 {
7896         return io_uring_setup(entries, params);
7897 }
7898
7899 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
7900 {
7901         struct io_uring_probe *p;
7902         size_t size;
7903         int i, ret;
7904
7905         size = struct_size(p, ops, nr_args);
7906         if (size == SIZE_MAX)
7907                 return -EOVERFLOW;
7908         p = kzalloc(size, GFP_KERNEL);
7909         if (!p)
7910                 return -ENOMEM;
7911
7912         ret = -EFAULT;
7913         if (copy_from_user(p, arg, size))
7914                 goto out;
7915         ret = -EINVAL;
7916         if (memchr_inv(p, 0, size))
7917                 goto out;
7918
7919         p->last_op = IORING_OP_LAST - 1;
7920         if (nr_args > IORING_OP_LAST)
7921                 nr_args = IORING_OP_LAST;
7922
7923         for (i = 0; i < nr_args; i++) {
7924                 p->ops[i].op = i;
7925                 if (!io_op_defs[i].not_supported)
7926                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
7927         }
7928         p->ops_len = i;
7929
7930         ret = 0;
7931         if (copy_to_user(arg, p, size))
7932                 ret = -EFAULT;
7933 out:
7934         kfree(p);
7935         return ret;
7936 }
7937
7938 static int io_register_personality(struct io_ring_ctx *ctx)
7939 {
7940         const struct cred *creds = get_current_cred();
7941         int id;
7942
7943         id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
7944                                 USHRT_MAX, GFP_KERNEL);
7945         if (id < 0)
7946                 put_cred(creds);
7947         return id;
7948 }
7949
7950 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
7951 {
7952         const struct cred *old_creds;
7953
7954         old_creds = idr_remove(&ctx->personality_idr, id);
7955         if (old_creds) {
7956                 put_cred(old_creds);
7957                 return 0;
7958         }
7959
7960         return -EINVAL;
7961 }
7962
7963 static bool io_register_op_must_quiesce(int op)
7964 {
7965         switch (op) {
7966         case IORING_UNREGISTER_FILES:
7967         case IORING_REGISTER_FILES_UPDATE:
7968         case IORING_REGISTER_PROBE:
7969         case IORING_REGISTER_PERSONALITY:
7970         case IORING_UNREGISTER_PERSONALITY:
7971                 return false;
7972         default:
7973                 return true;
7974         }
7975 }
7976
7977 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
7978                                void __user *arg, unsigned nr_args)
7979         __releases(ctx->uring_lock)
7980         __acquires(ctx->uring_lock)
7981 {
7982         int ret;
7983
7984         /*
7985          * We're inside the ring mutex, if the ref is already dying, then
7986          * someone else killed the ctx or is already going through
7987          * io_uring_register().
7988          */
7989         if (percpu_ref_is_dying(&ctx->refs))
7990                 return -ENXIO;
7991
7992         if (io_register_op_must_quiesce(opcode)) {
7993                 percpu_ref_kill(&ctx->refs);
7994
7995                 /*
7996                  * Drop uring mutex before waiting for references to exit. If
7997                  * another thread is currently inside io_uring_enter() it might
7998                  * need to grab the uring_lock to make progress. If we hold it
7999                  * here across the drain wait, then we can deadlock. It's safe
8000                  * to drop the mutex here, since no new references will come in
8001                  * after we've killed the percpu ref.
8002                  */
8003                 mutex_unlock(&ctx->uring_lock);
8004                 ret = wait_for_completion_interruptible(&ctx->completions[0]);
8005                 mutex_lock(&ctx->uring_lock);
8006                 if (ret) {
8007                         percpu_ref_resurrect(&ctx->refs);
8008                         ret = -EINTR;
8009                         goto out;
8010                 }
8011         }
8012
8013         switch (opcode) {
8014         case IORING_REGISTER_BUFFERS:
8015                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
8016                 break;
8017         case IORING_UNREGISTER_BUFFERS:
8018                 ret = -EINVAL;
8019                 if (arg || nr_args)
8020                         break;
8021                 ret = io_sqe_buffer_unregister(ctx);
8022                 break;
8023         case IORING_REGISTER_FILES:
8024                 ret = io_sqe_files_register(ctx, arg, nr_args);
8025                 break;
8026         case IORING_UNREGISTER_FILES:
8027                 ret = -EINVAL;
8028                 if (arg || nr_args)
8029                         break;
8030                 ret = io_sqe_files_unregister(ctx);
8031                 break;
8032         case IORING_REGISTER_FILES_UPDATE:
8033                 ret = io_sqe_files_update(ctx, arg, nr_args);
8034                 break;
8035         case IORING_REGISTER_EVENTFD:
8036         case IORING_REGISTER_EVENTFD_ASYNC:
8037                 ret = -EINVAL;
8038                 if (nr_args != 1)
8039                         break;
8040                 ret = io_eventfd_register(ctx, arg);
8041                 if (ret)
8042                         break;
8043                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
8044                         ctx->eventfd_async = 1;
8045                 else
8046                         ctx->eventfd_async = 0;
8047                 break;
8048         case IORING_UNREGISTER_EVENTFD:
8049                 ret = -EINVAL;
8050                 if (arg || nr_args)
8051                         break;
8052                 ret = io_eventfd_unregister(ctx);
8053                 break;
8054         case IORING_REGISTER_PROBE:
8055                 ret = -EINVAL;
8056                 if (!arg || nr_args > 256)
8057                         break;
8058                 ret = io_probe(ctx, arg, nr_args);
8059                 break;
8060         case IORING_REGISTER_PERSONALITY:
8061                 ret = -EINVAL;
8062                 if (arg || nr_args)
8063                         break;
8064                 ret = io_register_personality(ctx);
8065                 break;
8066         case IORING_UNREGISTER_PERSONALITY:
8067                 ret = -EINVAL;
8068                 if (arg)
8069                         break;
8070                 ret = io_unregister_personality(ctx, nr_args);
8071                 break;
8072         default:
8073                 ret = -EINVAL;
8074                 break;
8075         }
8076
8077         if (io_register_op_must_quiesce(opcode)) {
8078                 /* bring the ctx back to life */
8079                 percpu_ref_reinit(&ctx->refs);
8080 out:
8081                 reinit_completion(&ctx->completions[0]);
8082         }
8083         return ret;
8084 }
8085
8086 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
8087                 void __user *, arg, unsigned int, nr_args)
8088 {
8089         struct io_ring_ctx *ctx;
8090         long ret = -EBADF;
8091         struct fd f;
8092
8093         f = fdget(fd);
8094         if (!f.file)
8095                 return -EBADF;
8096
8097         ret = -EOPNOTSUPP;
8098         if (f.file->f_op != &io_uring_fops)
8099                 goto out_fput;
8100
8101         ctx = f.file->private_data;
8102
8103         mutex_lock(&ctx->uring_lock);
8104         ret = __io_uring_register(ctx, opcode, arg, nr_args);
8105         mutex_unlock(&ctx->uring_lock);
8106         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
8107                                                         ctx->cq_ev_fd != NULL, ret);
8108 out_fput:
8109         fdput(f);
8110         return ret;
8111 }
8112
8113 static int __init io_uring_init(void)
8114 {
8115 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
8116         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
8117         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
8118 } while (0)
8119
8120 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
8121         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
8122         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
8123         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
8124         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
8125         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
8126         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
8127         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
8128         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
8129         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
8130         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
8131         BUILD_BUG_SQE_ELEM(24, __u32,  len);
8132         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
8133         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
8134         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
8135         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
8136         BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
8137         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
8138         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
8139         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
8140         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
8141         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
8142         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
8143         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
8144         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
8145         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
8146         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
8147         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
8148         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
8149         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
8150
8151         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
8152         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
8153         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8154         return 0;
8155 };
8156 __initcall(io_uring_init);